[med-svn] [Git][med-team/proteinortho][upstream] New upstream version 6.0.8+dfsg

Andreas Tille gitlab at salsa.debian.org
Wed Sep 25 16:01:45 BST 2019



Andreas Tille pushed to branch upstream at Debian Med / proteinortho


Commits:
0d5a5400 by Andreas Tille at 2019-09-25T14:06:39Z
New upstream version 6.0.8+dfsg
- - - - -


7 changed files:

- .gitlab-ci.yml
- CHANGELOG
- CHANGEUID
- Makefile
- README.md
- proteinortho6.pl
- src/proteinortho_ffadj_mcs.py


Changes:

=====================================
.gitlab-ci.yml
=====================================
@@ -17,20 +17,17 @@ gcc-latest-alloptions:
   - tar xzf diamond-linux64.tar.gz
   - mkdir ~/bin
   - cp diamond ~/bin
-  - perl proteinortho*pl -project=testasd -cpus=1 -ram=100 -verbose=2 -selfblast -silent -force -desc -checkfasta -cleanblast -debug -binpath=~/bin -tmp='~/' -e=0.000001 -sim=0.9 -identity=20 -cov=30 -subparaBlast='--more-sensitive' -synteny -dups=1 -cs=4 -alpha=0.4 -conn=0.01 -purity=0.00001 -minspecies=2 -subparaCluster='-cpus 1 -seed 1' -nograph -singles -xml -exactstep3 test/*faa >/dev/null 2>&1 && rm testasd*info* && export LC_NUMERIC="C" && export LC_ALL="C" && for f in testasd.*; do sort $f >$f.testasd; done; sha256sum -b *testasd | tr -d '\n' | awk '{if($0 == "eb88ba29afd4f2dba16d3dbf97a5b0d2ab7686654a854f8502f0e778628e7f56 *testasd.descriptions.testasd120f22094e2d6a75fb650523c7b5c2763a316aa7f8884dff0cbe3ccd002c9e1e *testasd.ffadj-graph.testasd9ad470e29be4937c6f4996f80221ede51670824bb2e4bb4a50946062a130ffd7 *testasd.poff.html.testasd4f8263bb4b2738e528635f3e121c659407119a1aecafb5340c9d28f5bd66cdaf *testasd.poff.tsv.testasd26d7f5d7b87dd7b71b4920753dc65e7c303e89cdfa56d3aaf00033c7918e6d10 *testasd.poff.tsv.xml.testasdf80df4c1a951bfb55b02300a273f6395694f01e8ae908e296d9c14a847d432ac *testasd.proteinortho.html.testasdfa18e9a0530f5a5754f045cfe97deaf818bdb5eb725619952633f1da0641cf7b *testasd.proteinortho.tsv.testasdc598b8c43e48e06614ec19e2f6b870e2737a7117a50ab2b1613880764d0884b2 *testasd.proteinortho.tsv.xml.testasd"){print $0." -> OK"; exit 0}else{print $0." -> failed"; exit 1}}'
+  - perl proteinortho*pl -project=testasd -cpus=1 -ram=100 -verbose=2 -selfblast -silent -force -desc -checkfasta -cleanblast -debug -binpath=~/bin -tmp='~/' -e=0.000001 -sim=0.9 -identity=20 -cov=30 -subparaBlast='--more-sensitive' -synteny -dups=1 -cs=4 -alpha=0.4 -conn=0.01 -purity=0.00001 -minspecies=2 -subparaCluster='-cpus 1 -seed 1' -nograph -singles -xml -exactstep3 test/*faa >/dev/null 2>&1 && rm testasd*poff* && rm testasd*fadj* && rm testasd*info* && export LC_NUMERIC="C" && export LC_ALL="C" && for f in testasd.*; do sort $f >$f.testasd; done; sha256sum -b *testasd | tr -d '\n' | awk '{if($0 == "eb88ba29afd4f2dba16d3dbf97a5b0d2ab7686654a854f8502f0e778628e7f56 *testasd.descriptions.testasdf80df4c1a951bfb55b02300a273f6395694f01e8ae908e296d9c14a847d432ac *testasd.proteinortho.html.testasdfa18e9a0530f5a5754f045cfe97deaf818bdb5eb725619952633f1da0641cf7b *testasd.proteinortho.tsv.testasdc598b8c43e48e06614ec19e2f6b870e2737a7117a50ab2b1613880764d0884b2 *testasd.proteinortho.tsv.xml.testasd"){print $0." -> OK"; exit 0}else{print $0." -> failed"; exit 1}}'
 
 gcc-latest-all-p:
   image: gcc
   stage: recompile-and-test
   script:
-  #- apt-get -y install libboost-all-dev
   - export CWD=$(pwd)
   - echo "installing last"
   - wget http://last.cbrc.jp/last-982.zip && unzip last*zip 2>/dev/null && cd last*/ && make && cp src/last* $HOME
   - cd $CWD && echo "installing usearch"
   - curl https://drive5.com/cgi-bin/upload3.py?license=2019070410321731111 --output $HOME/usearch && chmod +x $HOME/usearch
-  #- echo "installing rapsearch"
-  #- git clone https://github.com/zhaoyanswill/RAPSearch2 && cd RAP*/Src && make && mv *rapsearch* $HOME && cd ../../ 
   - cd $CWD && echo "installing mmseqs2"
   - git clone https://github.com/soedinglab/MMseqs2 && cd MMs* && cmake . && make && cp src/mmseqs $HOME && cd ..
   - cd $CWD && echo "installing blat"
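
The shortened test command above still follows the same regression-check pattern: sort every result file under a fixed locale, hash the sorted copies, and compare the concatenated digests against a reference string. A minimal shell sketch of that pattern (the expected digest below is a placeholder, not the pipeline's real reference value):

    export LC_ALL=C LC_NUMERIC=C
    for f in testasd.*; do sort "$f" > "$f.testasd"; done
    observed=$(sha256sum -b *testasd | tr -d '\n')
    expected="<concatenated sha256 lines of a known-good run>"
    [ "$observed" = "$expected" ] && echo "OK" || { echo "failed"; exit 1; }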


=====================================
CHANGELOG
=====================================
@@ -221,3 +221,6 @@
 	updated shebang of ffadj such that python2.7 is used directly (ffadj fails if called with higher version of python)
 	-p=blastp is now alias of blastp+ and legacy blast is now -p=blastp_legacy (blastn is equivalent)
 	Makefile: static now includes -lquadmath
+    25. Sept (uid: 3899)
+	synteny update to python3 (but the code looks fishy, the -synteny option now gets a deprecated warning)
+	proteinortho now only prints html automatically for <10 files and otherwise only prints the command to generate it


=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-3813
+3899


=====================================
Makefile
=====================================
@@ -4,22 +4,22 @@
 # Run 'make' for compiling everything in the current directory (using the installed version of lapack in e.g. /usr/lib/, you can install lapack with e.g. apt-get install libatlas3-base or liblapack3)
 # Run 'make STATIC=TRUE' for a static version
 # Run 'make USELAPACK=FALSE' for a version without(!) LAPACK (only power iteration is used)
-# Run 'make USEPRECOMPILEDLAPACK=FALSE' for directly recompiling the provided lapack version 3.8.0 and linking dynamically 
+# Run 'make USEPRECOMPILEDLAPACK=FALSE' for directly recompiling the provided lapack version 3.8.0 and linking dynamically
 
 # Run 'make CXX=g++-7' for using the g++-7 compiler. See Flags below for more information
-# Run 'make CXX=clang++' for using the clang compiler 
+# Run 'make CXX=clang++' for using the clang compiler
 
 # Run 'make install' for installing the compiled files to /usr/local/bin
 # Run 'make install PREFIX=/home/paul/bin/' for local installation
 
-############ FLAGS: ##########################################
+############ OPTIONS: ##########################################
 ## STATIC=TRUE : enable static compiling (default:FALSE) 
 ## CXX=g++ : the g++ compiler
 ## CXXFLAGS = compiler flags passed to g++
 ## CXXLIBRARY = the path to the libs like lapack,... (dont forget the -L)
 ## CXXINCLUDE = include path (^) (dont forget the -I)
 ## PREFIX = the installation prefix (only for make install)
-##############################################################
+################################################################
 
 ###########################
 ## environment variables ##
@@ -32,12 +32,24 @@ USELAPACK=TRUE
 # compile statically
 STATIC=FALSE
 
+ifdef static
+STATIC=$(static)
+endif
 ifdef PREFIX
 INSTALLDIR=$(PREFIX)
 endif
+ifdef prefix
+INSTALLDIR=$(prefix)
+endif
+ifdef installdir
+INSTALLDIR=$(installdir)
+endif
 ifdef LAPACK
 USELAPACK=$(LAPACK)
 endif
+ifeq ($(STATIC),true)
+STATIC=TRUE
+endif
 
 USEPRECOMPILEDLAPACK=TRUE
 
@@ -132,7 +144,7 @@ ifeq ($(USELAPACK),TRUE)
 ifeq ($(USEPRECOMPILEDLAPACK),TRUE)
 ifeq ($(STATIC),TRUE)
 	@echo "[ 20%] Building **proteinortho_clustering** with LAPACK (static linking)";
-	@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp  -o $@ $< $(LDFLAGS) $(LDLIBS) -static -Wl,--allow-multiple-definition -llapack -lblas -lgfortran -lquadmath -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive && ([ $$? -eq 0 ] ) || ( \
+	@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp  -o $@ $< $(LDFLAGS) $(LDLIBS) -static -Wl,--allow-multiple-definition -llapack -lblas -lgfortran -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -lquadmath && ([ $$? -eq 0 ] ) || ( \
 		echo "......$(ORANGE)static linking failed, now I try dynamic linking.$(NC)"; \
 		$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp  -o $@ $< $(LDFLAGS) $(LDLIBS) -llapack -lblas -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive && ([ $$? -eq 0 ] && echo "......OK dynamic linking was successful for proteinortho_clustering!";) || ( \
 			echo "......$(ORANGE)dynamic linking failed too, now I try dynamic linking without -WL,-whole-archive (this should now work for OSX).$(NC)"; \
@@ -216,15 +228,15 @@ install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_gra
 	@echo "INSTALLING everything to $(INSTALLDIR)"
 	@install -v $^ $(INSTALLDIR);
 	@echo "$(GREEN)Everything installed successfully to $(INSTALLDIR).$(NC)"
-	@echo "If needed you can add $(INSTALLDIR) to \$$PATH with 'export PATH=\$$PATH:$(INSTALLDIR)'." 
+	@echo "If needed you can add $(INSTALLDIR) to \$$PATH with 'export PATH=\$$PATH:$(INSTALLDIR)'."
 
 .PHONY: test
-test: proteinortho6.pl test_step2 test_step3 test_clean
-	@echo "[TEST] All tests $(GREEN)passed$(NC)"	
+test: proteinortho6.pl test_clean test_step2 test_step3 test_clean2
+	@echo "[TEST] All tests $(GREEN)passed$(NC)"
 
 .PHONY: test_step2
 test_step2: proteinortho6.pl
-	@echo "[TEST] 1. basic proteinortho6.pl -step=2 test. (algorithms that are not present are skipped)"	
+	@echo "[TEST] 1. basic proteinortho6.pl -step=2 test. (algorithms that are not present are skipped)"
 	@echo -n " [1/12] -p=blastp+ test: "
 	@if [ "$(shell which blastp)" = "" ]; then\
 		echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
@@ -234,15 +246,15 @@ test_step2: proteinortho6.pl
 		echo "$(GREEN)passed$(NC)"; \
 	fi
 
-	@echo -n " [2/12] -p=blastp+ synteny (PoFF) test: "
-	@if [ "$(shell which blastp)" = "" ]; then\
-		echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
-	else \
-		./proteinortho6.pl -silent -force -project=test_synteny -synteny -singles -p=blastp+ test/*.faa; \
-		set -e ; ./src/chk_test.pl test_synteny.proteinortho.tsv; \
-		set -e ; ./src/chk_test.pl test_synteny.poff.tsv; \
-		echo "$(GREEN)passed$(NC)"; \
-	fi
+#	@echo -n " [2/12] -p=blastp+ synteny (PoFF) test: "
+#	@if [ "$(shell which blastp)" = "" ]; then\
+#		echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
+#	else \
+#		./proteinortho6.pl -silent -force -project=test_synteny -synteny -singles -p=blastp+ test/*.faa; \
+#		set -e ; ./src/chk_test.pl test_synteny.proteinortho.tsv; \
+#		set -e ; ./src/chk_test.pl test_synteny.poff.tsv; \
+#		echo "$(GREEN)passed$(NC)"; \
+#	fi
 
 	@echo -n " [3/12] -p=diamond test: "
 	@if [ "$(shell which diamond)" = "" ]; then\
@@ -349,11 +361,16 @@ test_step3: proteinortho6.pl test_step2
 	@echo "$(GREEN)passed$(NC)"
 
 .PHONY: test_clean
-test_clean: 
+test_clean:
+	@echo "[TEST] Clean up all test files..."; \
+	rm -rf proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
+
+.PHONY: test_clean2
+test_clean2:
 	@echo "[TEST] Clean up all test files..."; \
 	rm -rf proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
 
 .PHONY: clean
-clean: 
+clean:
 	rm -rf src/BUILD test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*
 	rm -rf src/lapack-3.8.0/
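
The new lowercase aliases added above make the following invocations equivalent to their uppercase counterparts (a small sketch; the paths are only illustrative):

    make static=true                   # same as: make STATIC=TRUE
    make install prefix=$HOME/bin      # same as: make install PREFIX=$HOME/bin
    make install installdir=$HOME/bin  # sets INSTALLDIR directly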


=====================================
README.md
=====================================
@@ -1,7 +1,7 @@
 # Proteinortho
 
  Proteinortho is a tool to detect orthologous genes within different species. For doing so, it compares similarities of given gene sequences and clusters them to find significant groups. The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once. Details can be found in <a href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-124">Lechner et al., BMC Bioinformatics. 2011 Apr 28;12:124.</a>
-To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (manuscript in preparation), is already build in Proteinortho. The general workflow of proteinortho is depicted [![here](https://www.dropbox.com/s/7ubl1ginn3fmf8k/proteinortho_workflow.jpg?dl=0)].
+To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (doi:10.1371/journal.pone.0105015), is already built into Proteinortho. The general workflow of proteinortho is depicted [![here](https://www.dropbox.com/s/7ubl1ginn3fmf8k/proteinortho_workflow.jpg?dl=0)].
 
 # New Features of Proteinortho Version 6!
 
@@ -19,7 +19,7 @@ To enhance the prediction accuracy, the relative order of genes (synteny) can be
 
 # Continuous Integration
 supports
-The badge 
+The badge
 [![pipeline status](https://gitlab.com/paulklemm_PHD/proteinortho/badges/master/pipeline.svg)](https://gitlab.com/paulklemm_PHD/proteinortho/commits/master) indicates the current status of the continuous integration (CI) among various platforms (ubuntu, centos, debian, fedora) and GNU c++ versions (5, 6, latest)
 The whole git repository gets deployed on a clean docker image (gcc:latest,gcc:5,ubuntu:latest,fedora:latest,debian:latest,centos:latest) and compiled (make all) and tested (make test). The badge is green only if all tests pass. For more information see [Continuous Integration (proteinortho wiki)](https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Continuous%20Integration).
 
@@ -48,17 +48,17 @@ If you cannot execute the src/BUILD/Linux_x86_64/proteinortho_clustering, then y
 #### Easy installation with (bio)conda (for Linux + OSX)
 
     conda install proteinortho
-    
+
 If you need conda (see [here](https://docs.anaconda.com/anaconda/install/)) and the bioconda channel: `conda config --add channels defaults && conda config --add channels bioconda && conda config --add channels conda-forge`.
- 
+
 [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/proteinortho/README.html) [![alt](https://img.shields.io/conda/dn/bioconda/proteinortho.svg?style=flat)](https://bioconda.github.io/recipes/proteinortho/README.html)
 
-<br> 
+<br>
 
 #### Easy installation with brew (for OSX)
 
     brew install proteinortho
-    
+
 If you need brew (see [here](https://brew.sh/index_de))
 
 [![install with brew](https://img.shields.io/badge/install%20with-brew-brightgreen.svg?style=flat)](https://formulae.brew.sh/formula/proteinortho) [![dl](https://img.shields.io/badge/dynamic/json.svg?label=downloads&query=$[%27analytics%27][%27install%27][%27365d%27][%27proteinortho%27]&url=https%3A%2F%2Fformulae.brew.sh%2Fapi%2Fformula%2Fproteinortho.json&color=green)](https://formulae.brew.sh/formula/proteinortho)
@@ -94,8 +94,8 @@ Proteinortho uses standard software which is often installed already or is part
 
 <details>
   <summary>To <b>run</b> Proteinortho, you need: (Click to expand)</summary>
-  
-   
+
+
   - At least one of the following programs (default is diamond):
 
      - NCBI BLAST+ or NCBI BLAST legacy (to test this, type tblastn. apt-get install ncbi-blast+)
@@ -108,7 +108,7 @@ Proteinortho uses standard software which is often installed already or is part
      - blat (http://hgdownload.soe.ucsc.edu/admin/)
      - mmseqs2 (conda install mmseqs2, https://github.com/soedinglab/MMseqs2)
    - Perl v5.08 or higher (to test this, type perl -v in the command line)
-   - Python v2.6.0 or higher to include synteny analysis (to test this, type 'python -V' in the command line) 
+   - (optional) Python v3.0 or higher to include synteny analysis (to test this, type 'python -V' in the command line)
    - Perl standard modules (these should come with Perl): Thread::Queue, File::Basename, Pod::Usage, threads (if you miss one just install with `cpan install ...` )
 </details>
 
@@ -117,11 +117,11 @@ Proteinortho uses standard software which is often installed already or is part
   <summary>To <b>compile</b> Proteinortho (linux/osx), you need: (Click to expand)</summary>
 
    - GNU make (to test this, type 'make' in the command line)
-   - GNU g++ v4.1 or higher (to test this, type 'g++ --version' in the command line) 
-   - openmp (to test this, type 'g++ -fopenmp' in the command line) 
+   - GNU g++ v4.1 or higher (to test this, type 'g++ --version' in the command line)
+   - openmp (to test this, type 'g++ -fopenmp' in the command line)
    - (optional) gfortran for compiling LAPACK (to test this, type 'whereis gfortran' in the command line)
    - (optional) CMake for compiling LAPACK (to test this, type 'cmake' in the command line), OR you can use your own compiled version of lapack (you can get this with 'apt-get install liblapack3') and run 'make USEPRECOMPILEDLAPACK=TRUE'
-   
+
 </details>
 
 <br>
@@ -129,23 +129,23 @@ Proteinortho uses standard software which is often installed already or is part
 #### 2. Building and installing proteinortho from source (linux and osx)
 
   Here you can use a working lapack library, check this with 'dpkg --get-selections | grep lapack'. Install lapack e.g. with 'apt-get install libatlas3-base' or liblapack3.
-  
+
  If you don't have Lapack, then 'make' will automatically compile Lapack v3.8.0 for you!
 
-  Fetch the latest source code archive downloaded from <a href="https://gitlab.com/paulklemm_PHD/proteinortho/-/archive/master/proteinortho-master.zip">here</a> 
+  Fetch the latest source code archive downloaded from <a href="https://gitlab.com/paulklemm_PHD/proteinortho/-/archive/master/proteinortho-master.zip">here</a>
 <details> <summary>or from here (Click to expand)</summary>
 
   > git clone https://gitlab.com/paulklemm_PHD/proteinortho
-  
+
   > wget https://gitlab.com/paulklemm_PHD/proteinortho/-/archive/master/proteinortho-master.zip
 </details>
 <br>
-  
+
   - `tar -xzvf proteinortho*.tar.gz` or `unzip proteinortho*.zip` : Extract the files
   - `cd proteinortho*` : Change directory into the extracted folder
   - You can now run proteinortho6.pl directly (linux only).
   - `make clean && make` : If you want to recompile Proteinortho. (For osx you need a newer g++ compiler to support multithreading, see below)
-  - `make install` or `make install PREFIX=~/bin` if you dont have root privileges. 
+  - `make install` or `make install PREFIX=~/bin` if you don't have root privileges.
   - `make test` : To make sure Proteinortho works as expected. The output should look like below (3. Make test output).
 
 <details>
@@ -157,7 +157,7 @@ Install a newer g++ compiler for -fopenmp support (multithreading) with brew (ge
 brew install gcc --without-multilib
 ```
 
-Then you should have a g++-7 or whatever newer version that there is (g++-8,9,...). 
+Then you should have a g++-7 or whatever newer version that there is (g++-8,9,...).
 Next you have to tell make to use this new compiler with one of the following:
 ```
 ln -s /usr/local/bin/gcc-7 /usr/local/bin/gcc
@@ -217,7 +217,7 @@ If you have problems compiling/running the program go to [Troubleshooting (prote
   > **proteinortho6.pl [options] \<fasta file(s)\>** (one fasta for each species, at least 2)
 
   OR
-  
+
   > **proteinortho [options] \<fasta file(s)\>**
 
 # DESCRIPTION
@@ -228,8 +228,7 @@ If you have problems compiling/running the program go to [Troubleshooting (prote
   one. Details can be found in Lechner et al., BMC Bioinformatics. 2011 Apr
   28;12:124. To enhance the prediction accuracy, the relative order of genes
   (synteny) can be used as additional feature for the discrimination of
-  orthologs. The corresponding extension, namely PoFF (manuscript in
-  preparation), is already build in Proteinortho.
+  orthologs. The corresponding extension, namely PoFF (doi:10.1371/journal.pone.0105015), is already built into Proteinortho.
 
   Proteinortho assumes, that you have all your gene sequences in FASTA
   format either represented as amino acids or as nucleotides. The source
@@ -240,7 +239,7 @@ If you have problems compiling/running the program go to [Troubleshooting (prote
  -p=blastn+ (or some other algorithm). (In case you only have NCBI
   BLAST legacy installed, you need to tell this too - either by adding
   -p=blastp or -p=blastn respectively.) The full command for the example
-  files would thus be 
+  files would thus be
   > proteinortho6.pl -project=test test/C.faa test/E.faa
 
   test/L.faa test/M.faa. Instead of naming the FASTA files one by one, you
@@ -253,10 +252,10 @@ If you have problems compiling/running the program go to [Troubleshooting (prote
 
 Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/proteinortho](http://lechnerlab.de/proteinortho/) online for an interactive exploration of the different options of proteinortho.
 
-# OPTIONS 
+# OPTIONS
 
  **Main parameters** (can be used with -- or -)
- 
+
    - **--project**=name (default: myproject)
     prefix for all resulting file names
 
@@ -290,34 +289,34 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 
 <details>
   <summary>(Click to expand)</summary>
-  
+
   - **--p**=algorithm (default: diamond)
 
     <details>
       <summary>show all algorithms (Click to expand)</summary>
-      
+
         - blastn_legacy,blastp_legacy,tblastx_legacy : legacy blast family (shell commands: blastall -) family. The suffix 'n' or 'p' indicates nucleotide or protein input files.
 
         - blastn+,blastp+,tblastx+ : standard blast family (shell commands: blastn,blastp,tblastx)
         family. The suffix 'n' or 'p' indicates nucleotide or protein input files.
-    
+
         - diamond : Only for protein files! standard diamond procedure and for
         genes/proteins of length >40 with the additional --sensitive flag
-    
+
         - lastn,lastp : lastal. -n : dna files, -p protein files (BLOSUM62
         scoring matrix)!
-    
-        - rapsearch : Only for protein files! 
-    
+
+        - rapsearch : Only for protein files!
+
         - mmseqsp,mmseqsn : mmseqs2. -n : dna files, -p protein files
-    
+
         - topaz : Only for protein files!
-    
+
         - usearch : usearch_local procedure with -id 0 (minimum identity
         percentage).
-    
+
         - ublast : usearch_ublast procedure.
-    
+
         - blatp,blatn : blat. -n : dna files, -p protein files
     </details>
     <br>
@@ -344,11 +343,12 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 <br>
 
  **Synteny options (optional, step 2)**
+  (This option is deprecated)
   (output: <myproject>.ffadj-graph, <myproject>.poff.tsv (tab separated file)-graph)
 
 <details>
   <summary>(Click to expand)</summary>
-  
+
   - **--synteny**
     activate PoFF extension to separate similar by contextual adjacencies
     (requires .gff for each .fasta)
@@ -370,7 +370,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 
 <details>
   <summary>(Click to expand)</summary>
-      
+
   - **--singles**
     report singleton genes without any hit
 
@@ -387,7 +387,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
     do not generate *-graph file (pairwise orthology relations)
 
   - **--subparaCluster**='options'
-    additional parameters for the clustering algorithm (proteinortho_clustering) example -subparaCluster='-maxnodes 10000'. 
+    additional parameters for the clustering algorithm (proteinortho_clustering) example -subparaCluster='-maxnodes 10000'.
     Note: -rmgraph cannot be set. All other parameters of subparaCluster are replacing the default values (like -cpus or -minSpecies)
 
   - **--xml**
@@ -405,10 +405,10 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 <br>
 
  **Misc options**
- 
+
 <details>
   <summary>(Click to expand)</summary>
-      
+
   - **--cleanblast**
     cleans blast-graph with proteinortho_cleanupblastgraph
 
@@ -451,7 +451,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
   copies in the presence of two very similar paralogs (default), or just to
   reduce noise in the predictions by detecting multiple copies of genomic
   areas (add the parameter -dups=3). Please note that you need additional
-  data to include synteny, namely the gene positions in GFF3 format. 
+  data to include synteny, namely the gene positions in GFF3 format.
  As Proteinortho is primarily made for proteins, it will only accept GFF
   entries of type CDS (column #3 in the GFF-file). The attributes column
   (#9) must contain Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds
@@ -466,13 +466,13 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
   test.poff.tsv (tab separated file). This file is equivalent to the test.proteinortho.tsv file (above) but
   can be considered more accurate as synteny was involved for its
   construction.
-  
+
 # Output
  **BLAST Search (step 1-2)**
- 
+
 <details>
   <summary>myproject.blast-graph (Click to expand)</summary>
-  
+
     filtered raw blast data based on adaptive reciprocal best blast
     matches (= reciprocal best match plus all reciprocal matches within a
     range of 95% by default) The first two rows are just comments
@@ -487,7 +487,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
     evalue_ab, bitscore_ab, evalue_ba and bitscore_ba.
 
       # file_a    file_b
-      # a   b     evalue_ab     bitscore_ab   evalue_ba     bitscore_ba 
+      # a   b     evalue_ab     bitscore_ab   evalue_ba     bitscore_ba
       # E.faa     C.faa   
       # 3.8e-124        434.9   2.8e-126        442.2
       E_11  C_11  5.9e-51 190.7   5.6e-50 187.61
@@ -495,7 +495,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
       ...
  </details>
  <br>
-  
+
  **Clustering (step 3)**
 
 <details>
@@ -542,9 +542,9 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
     The html version of the myproject.proteinortho.tsv file
  </details>
  <br>
-  
+
  **POFF (-synteny)**
- 
+
   The synteny based graph files (myproject.ffadj-graph and
   myproject.poff.tsv (tab separated file)-graph) have two additional columns: same_strand and
   simscore. The first one indicates if two genes from a match are located at
@@ -564,7 +564,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 
  </details>
  <br>
-  
+
   <details>
   <summary>myproject.poff.tsv (tab separated file)-graph (Click to expand)</summary>
 
@@ -573,7 +573,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 
  </details>
  <br>
-  
+
 
 # EXAMPLES
  **Calling proteinortho**
@@ -581,14 +581,14 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
   test/
 
   test/C.faa:
-  
+
     >C_10
     VVLCRYEIGGLAQVLDTQFDMYTNCHKMCSADSQVTYKEAANLTARVTTDRQKEPLTGGY
     HGAKLGFLGCSLLRSRDYGYPEQNFHAKTDLFALPMGDHYCGDEGSGNAYLCDFDNQYGR
     ...
 
    test/E.faa:
-   
+
     >E_10
     CVLDNYQIALLRNVLPKLFMTKNFIEGMCGGGGEENYKAMTRATAKSTTDNQNAPLSGGF
     NDGKMGTGCLPSAAKNYKYPENAVSGASNLYALIVGESYCGDENDDKAYLCDVNQYAPNV
@@ -634,7 +634,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
 
 ## Example 1
 
-In the following example a huge blast graph is used for step 3 (clustering). 
+In the following example a huge blast graph is used for step 3 (clustering).
 The first connected component contains 7410694 nodes, hence the kmere heuristic is activated.
 Since the Fiedler vector would result in a good split, the kmere heuristic is then deactivated immediately.
 
@@ -657,7 +657,7 @@ Since the fiedler vector would result in a good split, the kmere heuristic is th
     ...
 
 </details>
-    
+
 <details>
 <summary>example for large graphs, where kmere is tested but not needed (Click to expand)</summary>
 
@@ -692,5 +692,3 @@ Since the fiedler vector would result in a good split, the kmere heuristic is th
   Lechner, M., Findeisz, S., Steiner, L., Marz, M., Stadler, P. F., &
   Prohaska, S. J. (2011). Proteinortho: detection of (co-) orthologs in
   large-scale analysis. BMC bioinformatics, 12(1), 124.
-
-
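
As a pointer for the usage described in the README above, a typical run over the bundled test files looks like this (diamond is the default search tool; the project name is arbitrary):

    proteinortho6.pl -project=test test/C.faa test/E.faa test/L.faa test/M.faa
    # or, with a wildcard instead of naming the files one by one:
    proteinortho6.pl -project=test test/*.faa

The orthologous groups then end up in test.proteinortho.tsv (and, if -synteny is used, additionally in test.poff.tsv).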


=====================================
proteinortho6.pl
=====================================
@@ -3,7 +3,7 @@
 ##########################################################################################
 #   This file is part of Proteinortho.
 #   (C) 2009/2010 Marcus Lechner
-# 
+#
 #   Proteinortho is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published
 #   by the Free Software Foundation; either version 2, or (at your
@@ -17,7 +17,7 @@
 #   You should have received a copy of the GNU General Public License
 #   along with Proteinortho; see the file COPYING.  If not, write to the
 #   Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-#   Boston, MA 02111-1307, USA. 
+#   Boston, MA 02111-1307, USA.
 ##########################################################################################
 ##########################################################################################
 # About
@@ -25,7 +25,7 @@
 # Proteinortho
 # input fasta files with proteins
 # output matrix with orthologous proteins
-# 
+#
 # @authors Marcus Lechner, Clemens Elias Thoelken, Paul Klemm
 # @email lechner\@staff.uni-marburg.de
 # @company University of Marburg
@@ -33,7 +33,7 @@
 #
 ##########################################################################################
 
-##MARK_FOR_NEW_BLAST_ALGORITHM => FLAG for adding a new blast algorithm. 
+##MARK_FOR_NEW_BLAST_ALGORITHM => FLAG for adding a new blast algorithm.
 
 ### ============================================================================
 ### MAN-PAGE
@@ -215,7 +215,7 @@ do not generate .graph file (pairwise orthology relations)
 
 =item B<--subparaCluster>='options'
 
-additional parameters for the clustering algorithm (proteinortho_clustering) example -subparaCluster='-maxnodes 10000'. 
+additional parameters for the clustering algorithm (proteinortho_clustering) example -subparaCluster='-maxnodes 10000'.
 Note: -rmgraph cannot be set. All other parameters of subparaCluster are replacing the default values (like -cpus or -minSpecies)
 
 =item B<--xml>
@@ -268,7 +268,7 @@ If you want to involve multiple machines or separate a Proteinortho run into sma
 First, run 'proteinortho6.pl -steps=1 ...' to generate the indices.
 Then you can run 'proteinortho6.pl -steps=2 -jobs=B<M>/B<N> ...' to run small chunks separately.
 Instead of B<M> and B<N> numbers must be set representing the number of jobs you want to divide the run into (B<M>) and the job division to be performed by the process.
-E.g. to divide a Proteinortho run into 4 jobs to run on several machines, use 
+E.g. to divide a Proteinortho run into 4 jobs to run on several machines, use
 
 =back
 
@@ -278,7 +278,7 @@ E.g. to divide a Proteinortho run into 4 jobs to run on several machines, use
 
 =over 4
 
-=item <myproject>.B<blast-graph> 
+=item <myproject>.B<blast-graph>
 
 filtered raw blast data based on adaptive reciprocal best blast
 matches (= reciprocal best match plus all reciprocal matches within a
@@ -289,12 +289,12 @@ about to follow. E.g. # M.faa L.faa tells that the next lines representing
 results for species M and L. All matches are reciprocal matches. If
 e.g. a match for M_15 L_15 is shown, L_15 M_15 exists implicitly.
 E-Values and bit scores for both directions are given behind each
-match. The 4 comment numbers ('# 3.8e-124        434.9...') are representing the median values of  
+match. The 4 comment numbers ('# 3.8e-124        434.9...') are representing the median values of
 evalue_ab, bitscore_ab, evalue_ba and bitscore_ba.
 
   ----------------------------------------
-  # file_a  file_b 
-  # a b evalue_ab bitscore_ab evalue_ba bitscore_ba 
+  # file_a  file_b
+  # a b evalue_ab bitscore_ab evalue_ba bitscore_ba
   # E.faa C.faa
   # 3.8e-124        434.9   2.8e-126        442.2
   E_10  C_10  3.8e-124  434.9 2.8e-126  442.2
@@ -307,7 +307,7 @@ evalue_ab, bitscore_ab, evalue_ba and bitscore_ba.
 
 =over 4
 
-=item <myproject>.B<proteinortho-graph> 
+=item <myproject>.B<proteinortho-graph>
 
 clustered myproject.blast-graph. Its connected components are represented in myproject.proteinortho.tsv. The format is the same as the blast-graph (see above).
 
@@ -319,7 +319,7 @@ clustered myproject.blast-graph. Its connected components are represented in myp
   E_11  C_11  5.9e-51 190.7 5.6e-50 187.6
   ----------------------------------------
 
-=item <myproject>.B<proteinortho> 
+=item <myproject>.B<proteinortho>
 
 The connected components.
 The first line starting with # is a comment line indicating the meaning of each column; each of the following lines represents an orthologous group.
@@ -341,6 +341,8 @@ By default, Proteinortho splits each group into two more dense subgroups when th
 
 =head2 POFF (-synteny)
 
+-->> This option is deprecated <<---
+
 The synteny based graph files (myproject.ffadj-graph and myproject.poff-graph) have two additional columns:
 same_strand and simscore. The first one indicates if two genes from a match are located at the same strands (1) or not (-1).
 The second one is an internal score which can be interpreted as a normalized weight ranging from 0 to 1 based on the respective e-values.
@@ -355,7 +357,7 @@ Moreover, a second comment line is followed after the species lines, e.g.
 
 =item <myproject>.B<ffadj-graph>
 
-filtered blast data based on adaptive reciprocal best blast matches and synteny (only if -synteny is set) 
+filtered blast data based on adaptive reciprocal best blast matches and synteny (only if -synteny is set)
 
 =item <myproject>.B<poff-graph>
 
@@ -404,7 +406,7 @@ To do so, add the parameter -synteny.
 You can use it to either come closer to one-to-one orthology relations by preferring syntenically conserved copies in the presence of two very similar paralogs (default),
 or just to reduce noise in the predictions by detecting multiple copies of genomic areas (add the parameter -dups=3).
 Please note that you need additional data to include synteny, namely the gene positions in GFF3 format.
-As Proteinortho is primarily made for proteins, it will only accept GFF entries of type CDS (column #3 in the GFF-file). 
+As Proteinortho is primarily made for proteins, it will only accept GFF entries of type CDS (column #3 in the GFF-file).
 The attributes column (#9) must contain Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective identifier in the FASTA format.
 It may not contain a semicolon (;)! Alternatively, you can also set ID=GENE IDENTIFIER.
 Example files are provided in the source code archive.
@@ -425,7 +427,7 @@ The directory src contains useful tools, e.g. proteinortho_grab_proteins.pl whic
 
 =head1 AUTHORS
 
-Marcus Lechner (lechner\@staff.uni-marburg.de), Clemens Elias Thoelken, Paul Klemm 
+Marcus Lechner (lechner\@staff.uni-marburg.de), Clemens Elias Thoelken, Paul Klemm
 
 =head1 ONLINE INFORMATION
 
@@ -453,7 +455,7 @@ use POSIX;
 ##########################################################################################
 # Variables
 ##########################################################################################
-our $version = "6.0.6";
+our $version = "6.0.8";
 our $step = 0;    # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
 our $verbose = 1; # 0/1   -> don't / be verbose
 our $debug = 0;   # 0/1   -> don't / show debug data
@@ -505,7 +507,7 @@ our $all_jobs_submitted :shared = 0;
 our $po_path = "";
 our $run_id = "";
 our %gene_counter;    # Holds the number of genes for each data file (for sorting)
-our %max_gene_length_diamond;    # Holds the maximum length of genes for each data file (for diamond -> -sensitive option) 
+our %max_gene_length_diamond;    # Holds the maximum length of genes for each data file (for diamond -> -sensitive option)
 our $threads_per_process :shared = 1; # Number of subthreads for blast
 
 our $freemem_inMB = -1; # -1 = detect automatically with 'free -m'
@@ -528,7 +530,7 @@ our $NC="\033[0m"; # No Color
 
 my $tput=`tput color 2>/dev/null`; # test if the shell supports colors
 $tput=~s/[\r\n]+$//;
-if ($tput=~m/[^0-9]|^&/ && $tput <16) {  
+if ($tput=~m/[^0-9]|^&/ && $tput <16) {
   $RED="";
   $GREEN="";
   $ORANGE="";
@@ -553,11 +555,11 @@ foreach my $option (@ARGV) {
   elsif ($option =~ m/^--?cleanblast$/)       { $checkblast = 1;  }
   elsif ($option =~ m/^--?cleanupblast$/)       { $checkblast = 1;  }
   elsif ($option =~ m/^--?verbose=(0|1)$/)    { $verbose = $1;  }
-  elsif ($option =~ m/^--?te?mp=(.+)$/)     { $tmp_path = $1; 
+  elsif ($option =~ m/^--?te?mp=(.+)$/)     { $tmp_path = $1;
                 # make sure it ends with /
                 unless ($tmp_path =~ /\/$/) {$tmp_path .= "/";}my $pwd=$ENV{"HOME"}; $tmp_path=~s/~/$pwd/g;
                 if(! -d $tmp_path || ! -R $tmp_path || ! -W $tmp_path ){
-                  &Error(" -tmp=$tmp_path is not accessible. Check if the directory exists and is read- and writable.");  
+                  &Error(" -tmp=$tmp_path is not accessible. Check if the directory exists and is read- and writable.");
                 }else{
                   if($tmp_path =~ m/proteinortho_cache/){
                     $keep=1;
@@ -573,8 +575,8 @@ foreach my $option (@ARGV) {
   elsif ($option =~ m/^--?cpus=auto$/)          { $cpus = 0; }
   elsif ($option =~ m/^--?alpha=([0-9\.]+)$/)     { $alpha = $1; }
   elsif ($option =~ m/^--?purity=([0-9\.]+)$/)      { $purity = $1; }
-  elsif ($option =~ m/^--?report=([0-9]+)$/)      { $report = $1; }  
-  elsif ($option =~ m/^--?minspecies=([0-9.]+)$/)       { if($1>=0){$minspecies = $1;}else{&Error("the argument -minspecies=$1 is invalid.$RED minspecies needs to be >=0!$NC\nminspecies: the min. number of genes per species. If a group is found with up to (minspecies) genes/species, it wont be split again (regardless of the connectivity).");} }  
+  elsif ($option =~ m/^--?report=([0-9]+)$/)      { $report = $1; }
+  elsif ($option =~ m/^--?minspecies=([0-9.]+)$/)       { if($1>=0){$minspecies = $1;}else{&Error("the argument -minspecies=$1 is invalid.$RED minspecies needs to be >=0!$NC\nminspecies: the min. number of genes per species. If a group is found with up to (minspecies) genes/species, it wont be split again (regardless of the connectivity).");} }
   elsif ($option =~ m/^--?conn=([0-9\.]+)$/)      { $connectivity = $1; }
   elsif ($option =~ m/^--?cov=([0-9]+)$/)       { $coverage = $1; }
   elsif ($option =~ m/^--?mcl$/)        { $useMcl = 1; }
@@ -588,36 +590,36 @@ foreach my $option (@ARGV) {
   elsif ($option =~ m/^--?sim=([0-9\.]+)$/)       { $sim = $1; }
   elsif ($option =~ m/^--?startat=([0-9]+)$/)     { $startat = $1; }
   elsif ($option =~ m/^--?stopat=([0-9]+)$/)      { $stopat = $1; }
-  elsif ($option =~ m/^--?jobs?=([\d]+)\/([\d]+)$/)   { $jobnumber = $1; $split_to_X_jobs = $2; }  
-  elsif ($option =~ m/^--?selfblast$/)      { $selfblast = 1; }  
-  elsif ($option =~ m/^--?selfblast=(0|1)$/)    { $selfblast = $1; }  
-  elsif ($option =~ m/^--?singles$/)      { $singles = 1; }  
-  elsif ($option =~ m/^--?singles=(0|1)$/)    { $singles = $1; }  
-  elsif ($option =~ m/^--?poff$/)       { $synteny = 1; }  
-  elsif ($option =~ m/^--?synteny$/)      { $synteny = 1; }  
-  elsif ($option =~ m/^--?synteny=(0|1)$/)    { $synteny = $1; }
-  elsif ($option =~ m/^--?dups=0$/)     { $duplication = 0; } 
+  elsif ($option =~ m/^--?jobs?=([\d]+)\/([\d]+)$/)   { $jobnumber = $1; $split_to_X_jobs = $2; }
+  elsif ($option =~ m/^--?selfblast$/)      { $selfblast = 1; }
+  elsif ($option =~ m/^--?selfblast=(0|1)$/)    { $selfblast = $1; }
+  elsif ($option =~ m/^--?singles$/)      { $singles = 1; }
+  elsif ($option =~ m/^--?singles=(0|1)$/)    { $singles = $1; }
+  elsif ($option =~ m/^--?poff$/)       { $synteny = 1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
+  elsif ($option =~ m/^--?synteny$/)      { $synteny = 1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
+  elsif ($option =~ m/^--?synteny=(0|1)$/)    { $synteny = $1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
+  elsif ($option =~ m/^--?dups=0$/)     { $duplication = 0; }
   elsif ($option =~ m/^--?dups=([1-8])$/)   { $duplication = $1+1;}
-  elsif ($option =~ m/^--?neighbourjoin$/)    { $neighbourjoin = 1; }  
+  elsif ($option =~ m/^--?neighbourjoin$/)    { $neighbourjoin = 1; }
   elsif ($option =~ m/^--?neighbourjoin=(0|1)$/)  { $neighbourjoin = $1; }
-  elsif ($option =~ m/^--?cs=([0-9]+)$/)    { $cs = $1; }  
-  elsif ($option =~ m/^--?keep$/)     { $keep = 1; }  
-  elsif ($option =~ m/^--?force$/)      { $force = 1; }  
-  elsif ($option =~ m/^--?clean$/)     { $clean = 1; }  
-  elsif ($option =~ m/^--?help$/)     { print_usage(); exit 0;}  
-  elsif ($option =~ m/^--?test$/)     { &check_bins; &get_po_path; print "All necessary proteinortho_* binaries are found.\n"; exit 0;}  
-  elsif ($option =~ m/^--?h$/)     { print_usage(); exit 0;}  
-  elsif ($option =~ m/^--?nograph$/)      { $nograph = 1; }  
-  elsif ($option =~ m/^--?xml$/)      { $doxml = 1; }  
-  elsif ($option =~ m/^--?graph$/)      { $nograph = 0; }  
-  elsif ($option =~ m/^--?desc$/)     { $desc = 1; }  
+  elsif ($option =~ m/^--?cs=([0-9]+)$/)    { $cs = $1; }
+  elsif ($option =~ m/^--?keep$/)     { $keep = 1; }
+  elsif ($option =~ m/^--?force$/)      { $force = 1; }
+  elsif ($option =~ m/^--?clean$/)     { $clean = 1; }
+  elsif ($option =~ m/^--?help$/)     { print_usage(); exit 0;}
+  elsif ($option =~ m/^--?test$/)     { &check_bins; &get_po_path; print "All necessary proteinortho_* binaries are found.\n"; exit 0;}
+  elsif ($option =~ m/^--?h$/)     { print_usage(); exit 0;}
+  elsif ($option =~ m/^--?nograph$/)      { $nograph = 1; }
+  elsif ($option =~ m/^--?xml$/)      { $doxml = 1; }
+  elsif ($option =~ m/^--?graph$/)      { $nograph = 0; }
+  elsif ($option =~ m/^--?desc$/)     { $desc = 1; }
  elsif ($option =~ m/^--?project=(.*)$/)   { $project = $1; $project=~s/[\/* \t\:\~\&\%\$\§\"\(\)\[\]\{\}\^\\]//g; }
   elsif ($option =~ m/^--?subparaBlast=(.*)$/i)  { $blastOptions = $1;}
   elsif ($option =~ m/^--?subparaCluster=(.*)$/i)  { $clusterOptions = $1;}
   elsif ($option =~ m/^--?v(ersion)?$/i)  { print $version."\n"; exit 0;}
   elsif ($option !~ /^-/)       { if(!exists($files_map{$option})){$files_map{$option}=1;push(@files,$option);}else{print STDERR "$ORANGE"."[WARNING]$NC The input $option was is skipped, since it was allready given as input.$NC\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n";sleep 10;print STDERR "Well then, proceeding...\n"} }
-  elsif ($option =~ m/^--?man$/i){ pod2usage(-exitstatus => 0, -verbose => 2) }
-  else  {&print_usage(); &reset_locale();die "Invalid command line option: \'$option\'!\n"; }
+  elsif ($option =~ m/^--?man$/i){ my $bperldoc=(1-(`perldoc -l Pod::Usage 2>&1`=~m/need to install/)); if( !$bperldoc ){ print STDERR "You need to install the perl-doc package to use this man program.\n"; }else{ pod2usage(-exitstatus => 0, -verbose => 2) } exit 0; }
+  else  {&print_usage(); &reset_locale();die "$RED"."[Error]$NC $ORANGE Invalid command line option: \'$option\'! $NC\n\n"; }
 }
 
 if($selfblast){$checkblast=1;}
@@ -626,7 +628,7 @@ $po_path = &get_po_path();    # Determine local path
 
 our $nucleotideAlphabet="ACGTURYSWKMBDHVNXacgturyswkmbdhvnx\.\-";
 our $aminoAlphabet="XOUBZACDEFGHIKLMNPQRSTVWYxoubzacdefghiklmnpqrstvwy\.\*\-";
-our $allowedAlphabet = { 
+our $allowedAlphabet = {
   'blastn_legacy' => 'n' ,
   'blastp_legacy' => 'a' ,
   'tblastx' => 'n' ,
@@ -665,7 +667,7 @@ my $restart_counter=-1; # restart only once
 # Check parameters
 ##########################################################################################
 if (defined($startat) || defined($stopat)) {
-  &Error("Sorry, -startat and -stopat were removed. Please use -jobs=M/N for more flexible job splitting.");  
+  &Error("Sorry, -startat and -stopat were removed. Please use -jobs=M/N for more flexible job splitting.");
 }
 
 if ($split_to_X_jobs == 0) {
@@ -683,7 +685,7 @@ if ($jobnumber != -1) {
 
 our $gccversionstr = `gcc --version 2>/dev/null`;
 our $gccversion_main = "";
-if ($? == 0) {  
+if ($? == 0) {
   $gccversion_main=($gccversionstr =~ m/^[^\n]+ ([\d]+)\.[\d\.]+\n/);
 }
 our $ompprocbind="close";
@@ -784,10 +786,10 @@ if ($step == 0 || $step == 1) {
 
 # Step 2, run blast and synteny algorithm
 if ($step == 0 || $step == 2) {
-  if($verbose){print STDERR "\n$GREEN**Step 2**$NC using $blastmode "; 
+  if($verbose){print STDERR "\n$GREEN**Step 2**$NC using $blastmode ";
   if($synteny || $selfblast){print STDERR "with :";}
   if($synteny){print STDERR " synteny";}
-  if($selfblast){print STDERR " selfblast";} 
+  if($selfblast){print STDERR " selfblast";}
   print STDERR "\n"; }
 
   &init_graph_output; # Initiate Output file(s)
@@ -874,7 +876,7 @@ sub cluster {
     if($verbose){print STDERR "$ORANGE"."[WARNING]$NC Using MCL for clustering (-ram has no effect).$NC\n";} # minimum 5 MB
   }
 
-  if(!$useMcl){ 
+  if(!$useMcl){
     system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -verbose $verbose -conn $connectivity -purity $purity $clusterOptions -rmgraph '$rm_simgraph' $simgraph* >'$simtable'");
     if ($? != 0) {
           &Error("proteinortho_clustering failed. Did you use the static version?\nMaybe your operating system does not support the statically compiled version, please try recompiling proteinortho with 'make clean' and 'make' (and 'make install PREFIX=...').");
@@ -889,14 +891,17 @@ sub cluster {
     system("mv mcl.proteinortho-graph $csimgraph");
   }
 
-  if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtable\n You can extract the fasta files of each orthology group with 'proteinortho_grab_proteins.pl -tofiles $simtable ".join(" ",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
+  if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtable\n";}
+  if(scalar @files < 10){
+    if($verbose){print STDERR "You can extract the fasta files of each orthology group with 'proteinortho_grab_proteins.pl -tofiles $simtable ".join(" ",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
+  }
   
   if ($singles) {
     if($verbose){print STDERR "Adding singles...\n";}
     my $fastas = "'".join("' '",@files)."'";
     system("$po_path/proteinortho_singletons.pl $fastas <'$simtable' >>'$simtable'");
   }
-  
+
   if (!$nograph && !$useMcl) {
     system("$po_path/proteinortho_graphMinusRemovegraph '$rm_simgraph' $simgraph* >'$csimgraph'");
     unless ($keep) {unlink($rm_simgraph);}
@@ -905,8 +910,12 @@ sub cluster {
     if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csimgraph\n";}
   }
 
-  system("perl $po_path/proteinortho2html.pl $simtable ".join(" ",@files)." >$simtablehtml");
-  if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtablehtml\n";}
+  if(scalar @files < 10){
+    system("perl $po_path/proteinortho2html.pl $simtable ".join(" ",@files)." >$simtablehtml");
+    if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtablehtml\n";}
+  }else{
+    if($verbose){print STDERR "[OUTPUT] -> You can extract a html version of the output using :\nproteinortho2html.pl $simtable [PLACE FASTA FILES HERE] >$simtablehtml\n\n";}
+  }
 
   if ($doxml) {
     system("perl $po_path/proteinortho2xml.pl $simtable >$simtable.xml");
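
With this change, runs on 10 or more input files no longer get the HTML report automatically; proteinortho only prints the command to create it afterwards. A sketch of that manual step (file names are illustrative):

    proteinortho2html.pl myproject.proteinortho.tsv *.faa > myproject.proteinortho.html
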
@@ -928,7 +937,7 @@ sub cluster {
     }
 
     if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $syntable\nYou can extract the fasta files of each orthology group with 'proteinortho_grab_proteins.pl -tofiles $syntable ".join(" ",@files)."'\n(Careful: This will generate a file foreach line in the file $syntable).\n";}
-    
+
     if ($singles) {
       if($verbose){print STDERR "Adding singles...\n";}
       my $fastas = "'".join("' '",@files)."'";
@@ -962,14 +971,14 @@ sub print_header {
 
 
 sub print_usage {
-print STDERR "      
+print STDERR "
      |
   ${BLUE}  /${NC} ${RED2}\\  $NC
   ${BLUE} /\\${NC}${RED2} /\\ $NC
   ${BLUE}/ ${RED2}/${BLUE} \\ ${RED2}\\${NC}
 
 Usage: proteinortho6.pl [OPTIONS] FASTA1 FASTA2 [FASTA...] (one for each species, at least 2)
-Options: 
+Options:
          [General options]
          -project=    prefix for all result file names [default: myproject]
          -cpus=       number of processors to use [default: auto]
@@ -991,8 +1000,8 @@ Options:
                       blast*+|tblastx+ : standard blastal family (blastp+ : protein files, blastn+ : dna files)
                       blast*_legacy : legacy blast family
                       diamond : Only for protein files! standard diamond procedure and for genes/proteins of length >40 with the additional --sensitive flag
-                      usearch : usearch_local procedure with -id 0 (minimum identity percentage). 
-                      ublast : usearch_ublast procedure. 
+                      usearch : usearch_local procedure with -id 0 (minimum identity percentage).
+                      ublast : usearch_ublast procedure.
                       lastn : standard lastal. Only for dna files!
                       lastp : lastal using -p and BLOSUM62 scoring matrix. Only for protein files!
                       rapsearch : Only for protein files!
@@ -1002,7 +1011,7 @@ Options:
          -e=          E-value for blast [default: 1e-05]
 
          [Synteny options]
-         -synteny     activate PoFF extension to separate similar sequences
+         -synteny     activate PoFF extension to separate similar sequences print
                       by contextual adjacencies (requires .gff for each .fasta)
          -dups=       PoFF: number of reiterations for adjacencies heuristic,
                       to determine duplicated regions (default: 0)
@@ -1017,7 +1026,7 @@ Options:
          -xml        produces an OrthoXML formatted file of the *.proteinortho.
 
          (...)
-        
+
 For more information see the man page: 'proteinortho -man' or online: https://gitlab.com/paulklemm_PHD/proteinortho
 Or you can use the GUI proteinorthoHelper.html (available at http://lechnerlab.de/proteinortho/)
 Do you have suggestions or need more help? Write a mail to lechner\@staff.uni-marburg.de.
@@ -1088,7 +1097,7 @@ sub run_blast {
 
   # Spawn worker threads
   for (my $i = 0; $i < $cpus; $i++) {threads->create('workerthread');}
-  
+
   # For each file against each other file
   my $job_number = 0;
   SPEC: for (my $i = 0; $i < scalar(@files)-1+$selfblast; $i++) {
@@ -1117,6 +1126,8 @@ sub workerthread {
   my $thread_id = threads->tid();
   my $temp_file = "$tmp_path$project-$run_id-$thread_id";
 
+  $temp_file=~s/^\.\///;
+
   # Clean up, just to be safe
   unlink("$temp_file.tmp");
   unlink("$temp_file.log");
@@ -1159,7 +1170,7 @@ sub workerthread {
     # Work
     &set_threads_per_process(scalar(threads->list()));
     my $result_ij = &blast($file_i,$file_j,$thread_id);
-  
+
     my $result_ji;
     if ($file_i eq $file_j) {
       # One run is enough (selfblast)
@@ -1189,10 +1200,9 @@ sub workerthread {
       open(PREGRAPH,">>$temp_file.tmp") || &Error("Could not open temp file '$temp_file.tmp': $!");
       print PREGRAPH $ordered_matches;
       close(PREGRAPH);
-      my $cmd = "$po_path/proteinortho_ffadj_mcs.py '$temp_file.tmp' $alpha";
-      if ($duplication) {
-        $cmd .= " --repeat-matching $duplication --min-cs-size $cs";
-      }
+      my $ffadj_param = "-a $alpha";
+      if ($duplication) { $ffadj_param .= " -R $duplication -M $cs";}
+      my $cmd = "$po_path/proteinortho_ffadj_mcs.py $ffadj_param '$temp_file.tmp'";
       if ($debug) {print STDERR "$cmd\n";}
       my $synt_stats = qx($cmd);
       $synt_stats=~s/[\r\n]+$//;
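
The hunk above changes how the synteny helper is invoked: the input file used to come first as a positional argument, followed by the alpha value and long options, whereas now alpha, -R and -M are passed as short options and the input file comes last. Assuming illustrative values for the alpha, dups and cs settings, the call changes roughly like this:

    # 6.0.7 and earlier
    proteinortho_ffadj_mcs.py mytemp.tmp 0.5 --repeat-matching 2 --min-cs-size 3
    # 6.0.8
    proteinortho_ffadj_mcs.py -a 0.5 -R 2 -M 3 mytemp.tmp
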
@@ -1218,11 +1228,13 @@ sub workerthread {
       my %close = %{$close_copies_pointer};
       # Generate hash for synteny hits
       my %synteny;
+
       unless (-s "$temp_file.matching") {
-        print STDERR "$RED [Error] Failed to run $po_path/proteinortho_ffadj_mcs.py for\n$file_i vs $file_j\nMoving source to $temp_file.err for debugging\nI will continue, but results may be insufficient.$NC \n";
+        print STDERR "\n$RED [Error] Failed to run $po_path/proteinortho_ffadj_mcs.py for\n$file_i vs $file_j\nMoving source to $temp_file.err for debugging\nI will continue, but results may be insufficient.$NC \n\n";
         system("mv $temp_file.tmp $temp_file.err");
         next;
       }
+
       open(OSYNGRAPH,"<$temp_file.matching") || &Error("Could not open temp file $temp_file.matching: $!'");
       while(<OSYNGRAPH>) {
           $_=~s/[\r\n]+$//;
@@ -1510,7 +1522,7 @@ sub print_blast_stats {
     lock($jobs_done);
     my $percent = int($jobs_done/$jobs_todo*10000)/100;
     print STDERR "\r                                                                               ";
-  
+
     if ($split_to_X_jobs == -1) {
       print STDERR "\rRunning blast analysis: $percent% ($jobs_done/$jobs_todo)";
     }
@@ -1573,7 +1585,7 @@ sub get_legal_matches {
 # if(($blastmode ne "phmmer" && $blastmode ne "jackhmmer")){ #workaround for jackhmmer + phmmer
     # Percent identity
     if (!$twilight && $local_identity < $identity)          {next;}
-    if ( $twilight && $local_identity < &identitybylength($alignment_length)) {next;} 
+    if ( $twilight && $local_identity < &identitybylength($alignment_length)) {next;}
     # Min. length
     if ($blastmode eq "tblastx+" || $blastmode eq "tblastx") {$alignment_length *= 3;}
     if ($alignment_length < $length{$query_id}*($coverage/100)+0.5)     {next;}
@@ -1634,7 +1646,7 @@ sub generate_indices {
 
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         system("$makedb -d '$file' -n '$file.$blastmode' >\/dev\/null");
         if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";
@@ -1653,7 +1665,7 @@ sub generate_indices {
       if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         system("$makedb '$file' -d '$file.$blastmode' --quiet >\/dev\/null");
         if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";
@@ -1673,7 +1685,7 @@ sub generate_indices {
 
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         system("$makedb index -f '$file' -p '$file.$blastmode' >\/dev\/null 2>\/dev\/null");
         if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";
@@ -1693,7 +1705,7 @@ sub generate_indices {
 
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         system("$makedb --dbtype 1 '$file' '$file.$blastmode' >\/dev\/null 2>\/dev\/null");
         if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";
@@ -1713,7 +1725,7 @@ sub generate_indices {
 
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         system("$makedb --dbtype 2 '$file' '$file.$blastmode' >\/dev\/null 2>\/dev\/null");
         if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";
@@ -1724,16 +1736,16 @@ sub generate_indices {
 
           if($?!=0){ $keep=$oldkeep; &Error("The database generation failed once again, please retry with 'sudo' or move the fasta files to a directory with write permissions. If this failes too, then there is something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- consider to submitting (mailing) this case to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
         }
-      } 
+      }
     }
   }
   elsif ($blastmode eq "usearch" || $blastmode eq "ublast") {
     foreach my $file (@_) {
       if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
-      
+
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         system("$makedb '$file' -output '$file.$blastmode' >\/dev\/null 2>\/dev\/null");
         if ($? != 0) {print STDERR ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database. Most likely you don't have write permissions in the directory of the fasta files. I will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){print STDERR "$ORANGE Please ensure that you use -tmp=$tmp_path -keep for future analysis.$NC";}print "\n";
@@ -1752,7 +1764,7 @@ sub generate_indices {
       if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         if($blastmode eq "lastp"){
           system("$makedb -p '$file.$blastmode' '$file'");
@@ -1786,7 +1798,7 @@ sub generate_indices {
       if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         if ($debug) {print STDERR "$makedb '$file' -out '$file.$blastmode' >\/dev\/null\n";}
         system("$makedb '$file' -out '$file.$blastmode' >\/dev\/null");
@@ -1805,7 +1817,7 @@ sub generate_indices {
       if ($file =~ /\s/) {&Error("File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behaviours, please change the file name!\n");}
       if(!$force && join("",glob("$file.$blastmode*")) ne ""){
         if ($verbose) {print STDERR "The database for '$file' is present and will be used\n";}
-      }else{ 
+      }else{
         if ($verbose) {print STDERR "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
         if ($debug) {print STDERR "$makedb '$file' -out '$file.$blastmode' >\/dev\/null\n";}
         system("$makedb '$file' -n '$file.$blastmode' >\/dev\/null");
@@ -1892,7 +1904,7 @@ sub blast {
 
   # File does not exists yet or I am forced to rewrite it
   if (!(-e $bla) || $force) {
-    
+
     if (-e $bla && $force) { system("rm $bla"); }
 
     if ($debug || $verbose==2) {print STDERR "$command\n";}                     # 5.16
@@ -1903,7 +1915,7 @@ sub blast {
         &Error($blastmode." failed.\nThe most likely  errorsources of this are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and removed the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files are mixed nucleotide and aminoacid sequences or just not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.");
       }
       if($debug eq "test_sort"){while (<"$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $bla\n";&reset_locale();die;}}}
-      system("rm $bla.daa"); 
+      system("rm $bla.daa");
     }
     elsif ($blastmode eq "rapsearch") {
       system("$command");
@@ -1924,8 +1936,8 @@ sub blast {
       system("rm $bla.aln"); #remove aln file
       system("touch $bla.m8"); # rapsearch does not produces an output file if there are no hits found (<evalue threshold).
       system("tail -n +6 $bla.m82 >'$bla.tmp'"); # remove head/comment lines of rapsearch
-      system("rm $bla.m8"); 
-      system("rm $bla.m82"); 
+      system("rm $bla.m8");
+      system("rm $bla.m82");
     }
     elsif ($blastmode eq "usearch" || $blastmode eq "ublast") {
       system("$command");
@@ -1936,7 +1948,7 @@ sub blast {
       #system("perl $po_path/proteinortho_formatUsearch.pl $bla >'$bla.tmp_format'"); # problem with ublast/usearch: gene names include the description.
       #system("sort $bla.tmp_format -k12,12rg >'$bla.tmp'");
       system("perl $po_path/proteinortho_formatUsearch.pl $bla >'$bla.tmp'"); # problem with ublast/usearch: gene names include the description.
-      #system("rm $bla.tmp_format"); 
+      #system("rm $bla.tmp_format");
     }
     elsif ($blastmode eq "topaz") {
       system("$command");
@@ -2284,7 +2296,7 @@ sub read_details {
 
       if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) && $cur_gene_is_valid<1){last;}
       if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) && ($genelength>50 && ( ($allowedAlphabet->{$blastmode} eq "n" && $ATCGNoccurences/$genelength < 0.5) || ($allowedAlphabet->{$blastmode} eq "a" && $ATCGNoccurences/$genelength > 0.8)))){$cur_gene_is_valid= -1;last;}
-      
+
       $gene_counter{$file}++;
       $_ =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
       $_ =~ s/^>//;
@@ -2302,15 +2314,15 @@ sub read_details {
       $cur_gene_is_valid=1;
       $ATCGNoccurences=0;
       $genelength=0;
-    
-    }else{ 
+
+    }else{
 
       $_ =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
       if(!$force && $checkfasta&& exists($allowedAlphabet->{$blastmode})){ #test if the current gene is valid or not (for blastalgorithms that require either amino or nucleotide sequences only)
           if($allowedAlphabet->{$blastmode} eq "a"){ #"a"= aminoacid sequence
             if( $_ =~ /[^$aminoAlphabet]/){
               $cur_gene_is_valid=0;
-            } 
+            }
           }elsif($_ =~ /[^$nucleotideAlphabet]/){
               $cur_gene_is_valid=0;
           }
@@ -2344,7 +2356,7 @@ sub read_details {
 
   if(!$force && $checkfasta && $cur_gene_is_valid==1 && exists($allowedAlphabet->{$blastmode}) && ($genelength>50 && ( ($allowedAlphabet->{$blastmode} eq "n" && $ATCGNoccurences/$genelength < 0.5) || ($allowedAlphabet->{$blastmode} eq "a" && $ATCGNoccurences/$genelength > 0.8)))){$cur_gene_is_valid= -1;}
   if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) ){
-    
+
     if($allowedAlphabet->{$blastmode} eq "n" && $cur_gene_is_valid<1 ){
 
       if($cur_gene_is_valid==-1){
@@ -2360,7 +2372,7 @@ sub read_details {
       print STDERR "\nWell then, proceeding...\n\n";
         goto RESTART;
       }
-      
+
       &Error("\nThe algorithm (-p=$blastmode) does not support the given input files (use --force to skip this behaviour)...");
 
     }elsif($allowedAlphabet->{$blastmode} eq "a" && $cur_gene_is_valid<1 ){
@@ -2370,7 +2382,7 @@ sub read_details {
       }else{
         print STDERR ("\$ORANGE [WARNING]$NC Found forbidden non-aminoacid character in input fasta file '$file' in gene '$lastgenename'. $blastmode expects aminoacid characters$NC");
       }
-    
+
       if(exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing
         $blastmode = $blastmode_pendant->{$blastmode};
         print STDERR ("\n!!!\n[WARNING]$NC Switching now to $blastmode and restarting...\n");
@@ -2422,12 +2434,11 @@ sub read_details {
 
 sub Error {
   $debug=1;
-  print STDERR "\n";
-  print STDERR &get_parameter;
+  if($_[0] ne "I need at least two files to compare something!"){print STDERR "\n";print STDERR &get_parameter;}
 
   print STDERR "\n\n$RED"."[Error]$NC $ORANGE ".$_[0]." $NC \n\n";
 
-  print STDERR "(If you cannot solve this error, please send a report to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com including the parameter-vector above or visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes for more help. Further more all mails to lechner\@staff.uni-marburg.de are welcome)\n\n\n";
+  if($_[0] ne "I need at least two files to compare something!"){print STDERR "(If you cannot solve this error, please send a report to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com including the parameter-vector above or visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes for more help. Further more all mails to lechner\@staff.uni-marburg.de are welcome)\n\n\n";}
 
   &reset_locale();
   if (!$keep && $tmp_path =~ m/\/proteinortho_cache_[^\/]+\d*\/$/ && $step!=1 ){system("rm -r $tmp_path >/dev/null 2>&1");}
@@ -2454,7 +2465,7 @@ sub gff4fasta {
   return $gff;
 }
 
-sub get_po_path { 
+sub get_po_path {
   my @tmppath = fileparse($0); # path to the C++-part of this program
 
   my $uname=`uname -s`;
@@ -2481,9 +2492,9 @@ sub get_po_path {
     }elsif(-x "$binpath/proteinortho_clustering"){
       $tmppath[1]="$binpath";
       if($debug){print STDERR "Detected ".$tmppath[1]."\n";}
-    } 
+    }
   }
-  
+
   if(!-x $tmppath[1]."/proteinortho_clustering"){
     &Error("cannot find proteinortho_clustering in: the current directory '.', ./src/, ./src/BUILD/$uname , /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
     exit 1;
@@ -2504,7 +2515,7 @@ sub get_po_path {
   #   &Error("cannot find proteinortho2tree.pl in $tmppath[1].\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n";
   #   exit 0;
   # }
-  if(!-x $tmppath[1]."/proteinortho_ffadj_mcs.py"){
+  if(!-x $tmppath[1]."/proteinortho_ffadj_mcs.py" && $synteny){
     &Error("cannot find proteinortho_ffadj_mcs.py$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
     exit 1;
   }
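Beyond the whitespace cleanup, proteinortho6.pl picks up two behavioural changes here: Error() no longer prints the full parameter vector and bug-report footer when the message is simply "I need at least two files to compare something!", and the check for proteinortho_ffadj_mcs.py in get_po_path() is now gated on -synteny, so a missing Python helper no longer aborts runs that never use it. A rough sketch of that feature-gated dependency check, with hypothetical names (find_helpers, have) and assuming only the behaviour visible in the hunks above:

    import shutil

    def find_helpers(synteny=False, binpath=None):
        # Only insist on optional helpers when the feature that needs them is
        # enabled, mirroring the new "&& $synteny" guard in get_po_path().
        def have(tool):
            return shutil.which(tool, path=binpath) is not None

        missing = [t for t in ["proteinortho_clustering"] if not have(t)]
        if synteny and not have("proteinortho_ffadj_mcs.py"):
            missing.append("proteinortho_ffadj_mcs.py")
        if missing:
            raise SystemExit("cannot find: " + ", ".join(missing))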


=====================================
src/proteinortho_ffadj_mcs.py
=====================================
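The rewrite of proteinortho_ffadj_mcs.py below is largely a straight Python 2.7 to Python 3 port: the shebang changes, izip/xrange give way to zip/range, dict.has_key() becomes the in operator, sys.maxint becomes sys.maxsize, "print >> f" turns into explicit file writes, sorted(..., cmp=...) is replaced by a key function, and map/filter results are wrapped in list() where a list is actually needed. A small illustrative Python 3 snippet of the recurring substitutions (values and the file name are made up, not taken from the patch):

    from sys import maxsize                      # was: from sys import maxint
    g1 = [('chr1', 3), ('chr1', 7), ('chr1', maxsize)]
    g2 = [('chr1', 1), ('chr1', 9)]
    g1pos = dict(zip(g1, range(len(g1))))        # was: dict(izip(g1, xrange(len(g1))))
    assert ('chr1', 3) in g1pos                  # was: g1pos.has_key(('chr1', 3))
    merged = sorted(set(g1 + g2), key=lambda x: (x[0], x[1]))  # was: sorted(..., cmp=chr_srt)
    labels = list(map(str, merged))              # was: map(str, merged) used directly as a list
    with open('matching.tsv', 'w') as f:         # was: print >> f, '...'
        f.write('G1\tG2\tdirection\tedge weight\n')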
@@ -1,31 +1,26 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/python3
 
-from sys import stdout, stderr, exit, argv, maxint
+from sys import stderr, exit, argv, maxsize
 from copy import deepcopy
 from bisect import bisect
-from itertools import izip, product
-from os.path import basename, dirname
+from itertools import product
 from random import randint
 from math import ceil
-import logging
-import csv
+import logging as log
+
+ALPHA = 1
 
-DIRECTION_CRICK_STRAND = '+'
-DIRECTION_WATSON_STRAND = '-'
 
 class BothStrands:
+
     def __eq__(self, x):
-        return x == '+' or x =='-' or isinstance(x, BothStrands)
+        return x == '+' or x == '-' or isinstance(x, BothStrands)
+
     def __str__(self):
         return '+/-'
 
-DIRECTION_BOTH_STRANDS = BothStrands()
-
-LOG_FILENAME = 'info.log'
 
 class Run:
-
-    # public variables
     direction = None
     startG1 = None
     startG2 = None
@@ -37,7 +32,7 @@ class Run:
         self.direction = direction
         self.startG1 = startG1
         self.startG2 = startG2
-        self.endG1 = startG1 
+        self.endG1 = startG1
         self.endG2 = startG2
         self.weight = list()
         self.weight.append(weight)
@@ -46,13 +41,12 @@ class Run:
         adjTerm = 0
         if len(self.weight) > 1:
             adjTerm = sum([self.weight[i] * self.weight[i+1] for i in
-                xrange(len(self.weight)-1)])
-        edgeTerm = sum([w **2  for w in self.weight])
-#        edgeTerm = max(self.weight)**2
+                           range(len(self.weight)-1)])
+        edgeTerm = sum([w**2 for w in self.weight])
         return alpha * adjTerm + (1-alpha) * edgeTerm
-    
+
     def extendRun(self, nextG1, nextG2, weight):
-        if self.direction == DIRECTION_CRICK_STRAND:
+        if self.direction == '+':
             self.endG1 = nextG1
             self.endG2 = nextG2
             self.weight.append(weight)
@@ -60,212 +54,193 @@ class Run:
             self.endG1 = nextG1
             self.startG2 = nextG2
             self.weight.append(weight)
-  
+
     def __len__(self):
         return len(self.weight)
 
     def __str__(self):
-        return 'G1:%s-%s G2:%s-%s %s (%.5f)' %(self.startG1, self.endG1,
-                self.startG2, self.endG2, self.direction, self.getWeight(alpha))
+        return 'G1:%s-%s G2:%s-%s %s (%.5f)' % (self.startG1, self.endG1,
+                                                self.startG2, self.endG2,
+                                                self.direction, self.getWeight(ALPHA))
 
-#class DummyRun(Run):
-#    def __init__(self, weights):
-#        self.weight = weights
 
-
-def readDistsAndOrder(data, edgeThreshold):
+def readDistsAndOrder(dist_file, edgeThreshold):
     res = dict()
     hasMultipleChromosomes = False
 
-    g1_chromosomes = dict()
-    g2_chromosomes = dict()
-    chr1 = 0
-    chr2 = 0
-    for line in csv.reader(data, delimiter='\t'):
-        if not res:
-            hasMultipleChromosomes = len(line) == 6
+    g1_chrom_pos, g2_chrom_pos = {}, {}
+    chr1, chr2 = '0', '0'
+    with open(dist_file) as data:
+        for line in data:
+            cells = line.rstrip().split('\t')
+            if not res:
+                hasMultipleChromosomes = len(cells) == 6
 
-        if hasMultipleChromosomes:
-            chr1 = line[0]
-            g1 = int(line[1])
-            chr2 = line[2]
-            g2 = int(line[3])
-            direction = line[4]
-            edgeWeight = float(line[5])
-        else:
-            g1 = int(line[0])
-            g2 = int(line[1])
-            direction = line[2]
-            edgeWeight = float(line[3])
+            if hasMultipleChromosomes:
+                chr1 = cells[0]
+                g1 = int(cells[1])
+                chr2 = cells[2]
+                g2 = int(cells[3])
+                direction = cells[4]
+                edgeWeight = float(cells[5])
+            else:
+                g1 = int(cells[0])
+                g2 = int(cells[1])
+                direction = cells[2]
+                edgeWeight = float(cells[3])
 
-        if edgeWeight < edgeThreshold:
-            continue
+            if edgeWeight < edgeThreshold:
+                continue
 
-        if not g1_chromosomes.has_key(chr1):
-            g1_chromosomes[chr1] = set()
-        if not g2_chromosomes.has_key(chr2):
-            g2_chromosomes[chr2] = set()
+            if chr1 not in g1_chrom_pos:
+                g1_chrom_pos[chr1] = set()
+            if chr2 not in g2_chrom_pos:
+                g2_chrom_pos[chr2] = set()
 
-        g1_chromosomes[chr1].add(g1)
-        g2_chromosomes[chr2].add(g2)
+            g1_chrom_pos[chr1].add(g1)
+            g2_chrom_pos[chr2].add(g2)
 
-        l0 = (chr1, g1)
-        l1 = (chr2, g2)
+            l0 = (chr1, g1)
+            l1 = (chr2, g2)
 
-        if l0 not in res:
-            res[l0] = dict()
-        # append mapping pos in mappedGenome and the weight of the corresponding edge
-        res[l0][l1] = (direction == '1' and DIRECTION_CRICK_STRAND or \
-                DIRECTION_WATSON_STRAND, edgeWeight)
+            if l0 not in res:
+                res[l0] = dict()
+            # append mapping pos in mappedGenome and the weight of the corresponding edge
+            res[l0][l1] = (direction == '1' and '+' or '-', edgeWeight)
 
     # construct genome order
-    tel1, g1 = establish_linear_genome_order(g1_chromosomes)
-    tel2, g2 = establish_linear_genome_order(g2_chromosomes)
+    tel1, g1 = sort_genome(g1_chrom_pos)
+    tel2, g2 = sort_genome(g2_chrom_pos)
 
     # add telomeres
     for t1, t2 in product(tel1, tel2):
-        if not res.has_key(t1):
+        if t1 not in res:
             res[t1] = dict()
-        res[t1][t2] = (DIRECTION_BOTH_STRANDS, 1)
-
-#    res[maxint] = dict([
-#        (maxint, (DIRECTION_WATSON_STRAND, 1)), 
-#        (0,      (DIRECTION_WATSON_STRAND, 1)), 
-#        (maxint, (DIRECTION_CRICK_STRAND, 1)),
-#        (0,      (DIRECTION_CRICK_STRAND, 1))])
-#    res[maxint] = dict([
-#        (maxint, (DIRECTION_WATSON_STRAND, 1)), 
-#        (0,      (DIRECTION_WATSON_STRAND, 1)),
-#        (maxint, (DIRECTION_CRICK_STRAND, 1)), 
-#        (0,      (DIRECTION_CRICK_STRAND, 1))])
+        res[t1][t2] = (BothStrands(), 1)
 
     return hasMultipleChromosomes, g1, g2, res
 
-def establish_linear_genome_order(chromosomes):
+
+def sort_genome(chrom_pos):
     g = list()
     telomeres = set()
-    for k in sorted(chromosomes.keys()):
+    for k in sorted(chrom_pos.keys()):
         g.append((k, -1))
         telomeres.add((k, -1))
-        g.extend([(k, i) for i in sorted(chromosomes[k])])
-        g.append((k, maxint))
-        telomeres.add((k, maxint))
+        g.extend([(k, i) for i in sorted(chrom_pos[k])])
+        g.append((k, maxsize))
+        telomeres.add((k, maxsize))
     return telomeres, g
-        
+
+
 def insertIntoRunList(runs, runList):
-    keys = map(lambda x: x.getWeight(alpha), runList)
+    keys = [x.getWeight(ALPHA) for x in runList]
     for run in runs:
-        i = bisect(keys, run.getWeight(alpha))
-        keys.insert(i, run.getWeight(alpha))
+        i = bisect(keys, run.getWeight(ALPHA))
+        keys.insert(i, run.getWeight(ALPHA))
         runList.insert(i, run)
 
-def checkMatching(g1, g2, g1_runs, g2_runs, runs, dist):
-    g1pos = dict(izip(g1, xrange(len(g1))))
-    g2pos = dict(izip(g2, xrange(len(g2))))
 
+def checkMatching(g1, g2, g1_runs, g2_runs, runs, dist):
+    g1pos = dict(zip(g1, range(len(g1))))
+    g2pos = dict(zip(g2, range(len(g2))))
 
     if len(g1) != len(g2):
-        logging.error(('G1 and G2 have unequal length: len(G1) = %s, len(G2)' + \
-                ' %s') %(len(g1), len(g2)))
+        log.error('unequal length: |G1| = %d, |G2| = %d' % (len(g1), len(g2)))
     if len(g1) != len(g1_runs) or len(g2) != len(g2_runs):
-        logging.error(('Annotation vector length doesn\'t match with genome ' + \
-                'length: len(G1) = %s, len(g1_runs) = %s, len(G2) = %s, len(' + \
-                'g2_runs) = %s') %(len(g1), len(g1_runs), len(g2),
-                    len(g2_runs)))
+        log.error(('Annotation vector length does not match with genome ' +
+                   'length: len(G1) = %s, len(g1_runs) = %s, len(G2) = %s, len(' +
+                   'g2_runs) = %s') % (len(g1), len(g1_runs), len(g2), len(g2_runs)))
 
     all_included = set()
-    r_counter = 0 
+    r_counter = 0
     prev_run = None
     c_adj = 0
-    for i in xrange(len(g1)):
+    for i in range(len(g1)):
         if not g1_runs[i]:
-            logging.error('Gene %s is not included in any run' %g1[i])
+            log.error('Gene %s is not included in any run' % g1[i])
             continue
         if len(g1_runs[i]) > 1:
-            logging.error('Gene %s is included in more than one run: %s' %(g1[i], 
-                ', '.join(map(str, g1_runs[i]))))
+            log.error('Gene %s is included in more than one run: %s' % (g1[i],
+                      ', '.join(map(str, g1_runs[i]))))
             continue
-       
+
         r = list(g1_runs[i])[0]
 
         if prev_run != r:
             c_adj += len(r.weight)-1
             if r not in runs:
-                logging.error('Run %s not included in run list.' %r)
+                log.error('Run %s not included in run list.' % r)
             if r in all_included:
-                logging.error(('Run %s occurs twice in G1. Current gene ' + \
-                        'position: %s') % (r, g1[i]))
+                log.error(('Run %s occurs twice in G1. Current gene position: %s') % (r, g1[i]))
             r_counter += len(r.weight)
-            prev_run = r 
+            prev_run = r
 
         all_included.add(r)
         k = i-g1pos[r.startG1]
-        if r.direction == DIRECTION_CRICK_STRAND:
+        if r.direction == '+':
             g2j = g2[g2pos[r.startG2] + k]
         else:
             g2j = g2[g2pos[r.endG2] - k]
         eWgt = dist[g1[i]][g2j][1]
 
         if r.weight[k] != eWgt:
-            logging.error(('Edge weight of %s-%s differs in run %s, should be' + \
-                ' %.6f but is %.6f') %(g1[i], g2j, r, eWgt, r.weight[k]))
-  
+            log.error('Edge weight of %s-%s differs in run %s, should be %.6f but is %.6f' %
+                      (g1[i], g2j, r, eWgt, r.weight[k]))
+
     missing_runs = all_included.symmetric_difference(runs)
     if missing_runs:
-        logging.error(('Additional runs in runslist that are not part in the' + \
-                ' matching: %s') %(map(str, missing_runs)))
-        
-    logging.info('Number of adjacencies is %s in matching of size %s.' %(c_adj,
-        len(g1)))
-   
+        log.error('Additional runs in runslist that are not part in the matching: %s' %
+                  (list(map(str, missing_runs))))
+
+    log.info('Number of adjacencies is %s in matching of size %s.' % (c_adj, len(g1)))
+
     if r_counter != len(g1):
-        logging.error(('Sum of run lengths does not equal matching size! Sum ' + \
-                'of run lengths: %s, matching size: %s') % (r_counter, len(g1)))
+        log.error(('Sum of run lengths does not equal matching size! Sum ' +
+                   'of run lengths: %s, matching size: %s') % (r_counter, len(g1)))
 
-    for j in xrange(len(g2)):
+    for j in range(len(g2)):
         if not g2_runs[j]:
-            logging.error('Gene %s is not included in any run' %g2[j])
+            log.error('Gene %s is not included in any run' % g2[j])
         if len(g2_runs[j]) > 1:
-            logging.error('Gene %s is included in more than one run: %s' %(g2[j], 
-                ', '.join(map(str, g2_runs[j]))))
+            log.error('Gene %s is included in more than one run: %s' % (g2[j],
+                      ', '.join(map(str, g2_runs[j]))))
         if g2_runs[j].difference(all_included):
-            logging.error('G2 differs in runs from G1 on position %s: %s' %(g2[j], 
-                ', '.join(map(str,g2_runs[j].difference(all_included)))))
-    
+            log.error('G2 differs in runs from G1 on position %s: %s' % (g2[j],
+                      ', '.join(map(str, g2_runs[j].difference(all_included)))))
+
     for r in runs:
         if r.startG1 not in g1pos or r.endG1 not in g1pos or r.startG2 not in \
                 g2pos or r.endG2 not in g2pos:
-            logging.error(('Positions of run %s can not be mapped back to the' + \
-                    ' genomes.') %r)
+            log.error('Positions of run %s can not be mapped back to the genomes.' % r)
             continue
         if len(g1) <= g1pos[r.startG1] or len(g1) <= g1pos[r.endG1] or \
                 len(g2) <= g2pos[r.startG2] or len(g2) <= g2pos[r.endG2]:
-            logging.error(('Positions of run %s exceed borders of the ' + \
-                    'genomes') % r)
+            log.error('Positions of run %s exceed borders of the genomes' % r)
             continue
         if g1[g1pos[r.startG1]] != r.startG1 or g2[g2pos[r.startG2]] != \
                 r.startG2:
-            logging.error(('Start of run %s is not coherent with genome ' + \
+            log.error(('Start of run %s is not coherent with genome ' + \
                     'position on %s (G1) or %s (G2)') %(r, g1[g1pos[r.startG1]],
                         g2[g2pos[r.startG2]]))
         if g1[g1pos[r.endG1]] != r.endG1 or g2[g2pos[r.endG2]] != r.endG2:
-            logging.error(('End of run %s is not coherent with genome ' + \
+            log.error(('End of run %s is not coherent with genome ' + \
                     'position on %s (G1) or %s (G2)') %(r, g1[g1pos[r.endG1]],
                         g2[g2pos[r.endG2]]))
         if g1pos[r.endG1] - g1pos[r.startG1] != g2pos[r.endG2] - \
                 g2pos[r.startG2] or g1pos[r.endG1] - g1pos[r.startG1] < 0:
-            logging.error(('Length of run %s is erroneous: %s (on G1), %s ' + \
+            log.error(('Length of run %s is erroneous: %s (on G1), %s ' + \
                     '(on G2)') %(r, g1pos[r.endG1] - g1pos[r.startG1],
                         g2pos[r.endG2] - g2pos[r.startG2]))
         if len(r.weight) != g1pos[r.endG1] - g1pos[r.startG1] + 1:
-            logging.error(('Number of weights does not comply with run length. ' + \
+            log.error(('Number of weights does not comply with run length. ' + \
                     'Weights: %s, run length: %s, run: %s') %(len(r.weight),
-                        g1pos[r.endG1] - g1pos[r.startG1], r)) 
+                        g1pos[r.endG1] - g1pos[r.startG1], r))
 
-        g1_chromosomes = set(map(lambda x: x[0], g1[g1pos[r.startG1]:g1pos[r.endG1]+1]))
-        g2_chromosomes = set(map(lambda x: x[0], g2[g2pos[r.startG2]:g2pos[r.endG2]+1]))
+        g1_chromosomes = set([x[0] for x in g1[g1pos[r.startG1]:g1pos[r.endG1]+1]])
+        g2_chromosomes = set([x[0] for x in g2[g2pos[r.startG2]:g2pos[r.endG2]+1]])
         if len(g1_chromosomes) != 1 and len(g2_chromosomes) != 1:
-            logging.error(('Number of chromosomes on G1 (#chrs: %s) or G2 ' + \
+            log.error(('Number of chromosomes on G1 (#chrs: %s) or G2 ' + \
                     '(#chrs: %s) in run %s is not 1 (Meaning that possibly' + \
                     ' the run extends over two or more chromosomes, which ' + \
                     'shouldn\'t be allowed).') %(len(g1_chromosomes),
@@ -273,15 +248,15 @@ def checkMatching(g1, g2, g1_runs, g2_runs, runs, dist):
 
     # are all runs merged that can be merged?
     run_ends = dict()
-    for r in runs: 
-        if r.direction == DIRECTION_CRICK_STRAND:
+    for r in runs:
+        if r.direction == '+':
             run_ends[r.startG1] = (r.direction, r.startG2)
             run_ends[r.endG1] = (r.direction, r.endG2)
         else:
             run_ends[r.startG1] = (r.direction, r.endG2)
             run_ends[r.endG1] = (r.direction, r.startG2)
 
-    for i in xrange(len(g1)-1):
+    for i in range(len(g1)-1):
         g1i = g1[i]
         g1i2 = g1[i+1]
         if g1i in run_ends and g1i2 in run_ends and run_ends[g1i][0] == \
@@ -289,134 +264,131 @@ def checkMatching(g1, g2, g1_runs, g2_runs, runs, dist):
             direction = run_ends[g1i][0]
             g2i = run_ends[g1i][1]
             g2i2 = run_ends[g1i2][1]
-            if direction == DIRECTION_CRICK_STRAND and g2pos[g2i] == g2pos[g2i2]-1:
-                logging.error('Runs %s and %s could be merged, but are not!' % (map(str, g1_runs[i])[0], map(str, g1_runs[i+1])[0])) 
-            elif direction == DIRECTION_WATSON_STRAND and g2pos[g2i] == g2pos[g2i2]+1:
-                logging.error('Runs %s and %s could be merged, but are not!' % (map(str, g1_runs[i])[0], map(str, g1_runs[i+1])[0]))
- 
-def getAllRuns(g1, g2, d):
+            if direction == '+' and g2pos[g2i] == g2pos[g2i2]-1:
+                log.error('Runs %s and %s could be merged, but are not!' %
+                          (list(map(str, g1_runs[i]))[0], list(map(str, g1_runs[i+1]))[0]))
+            elif direction == '-' and g2pos[g2i] == g2pos[g2i2]+1:
+                log.error('Runs %s and %s could be merged, but are not!' %
+                          (list(map(str, g1_runs[i]))[0], list(map(str, g1_runs[i+1]))[0]))
 
-    g2pos = dict(izip(g2, xrange(len(g2))))
 
+def getAllRuns(g1, g2, d):
+    g2pos = dict(zip(g2, range(len(g2))))
     g1_runs = [set() for _ in g1]
     g2_runs = [set() for _ in g2]
-
     activeRuns = list()
-    reportedRuns= list()
-
+    reportedRuns = list()
 
-    for i in xrange(len(g1)):
-
-        curPos = g1[i] 
-        
+    for i in range(len(g1)):
+        curPos = g1[i]
         newRunList = list()
         forbiddenRunStarts = list()
-        
+
         # check if link exists, otherwise terminate all runs
         e = curPos in d
         # iterate over all runs
         for r in activeRuns:
-            jEnd= g2pos[r.endG2]
+            jEnd = g2pos[r.endG2]
             jStart = g2pos[r.startG2]
             if r.startG1[0] != curPos[0]:
-                # run could not be extended 
-                logging.info(('Terminate and report run %s, because %s is on a' + \
-                        ' different chromosome.') %(r, curPos))
+                # run could not be extended
+                log.info('terminating run %s, continue with %s.' % (r, curPos))
                 reportedRuns.append(r)
                 continue
             # extend to the right
-            if e and r.direction == DIRECTION_CRICK_STRAND and len(g2) > jEnd + 1 \
-                and g2[jEnd+1] in d[curPos] and d[curPos][g2[jEnd+1]][0] == \
-                DIRECTION_CRICK_STRAND and g2[jEnd+1][0] == r.endG2[0]:
+            if e and r.direction == '+' and len(g2) > jEnd + 1 and \
+                    g2[jEnd+1] in d[curPos] and d[curPos][g2[jEnd+1]][0] == \
+                    '+' and g2[jEnd+1][0] == r.endG2[0]:
                 g2_gene_r = g2[jEnd+1]
                 r.extendRun(curPos, g2_gene_r, d[curPos][g2_gene_r][1])
                 newRunList.append(r)
-                forbiddenRunStarts.append((DIRECTION_CRICK_STRAND, g2_gene_r))
+                forbiddenRunStarts.append(('+', g2_gene_r))
                 g1_runs[i].add(r)
                 g2_runs[jEnd+1].add(r)
-                logging.debug('Extended run %s to the right' %r)
+                log.debug('Extended run %s to the right' % r)
 
             # extend to the left
-            elif e and r.direction == DIRECTION_WATSON_STRAND and jStart > 0 and \
+            elif e and r.direction == '-' and jStart > 0 and \
                     g2[jStart-1] in d[curPos] and d[curPos][g2[jStart-1]][0] == \
-                    DIRECTION_WATSON_STRAND and g2[jStart-1][0] == r.startG2[0]:
+                    '-' and g2[jStart-1][0] == r.startG2[0]:
                 g2_gene_l = g2[jStart-1]
                 r.extendRun(curPos, g2_gene_l, d[curPos][g2_gene_l][1])
                 newRunList.append(r)
                 g1_runs[i].add(r)
                 g2_runs[jStart-1].add(r)
-                forbiddenRunStarts.append((DIRECTION_WATSON_STRAND, g2_gene_l))
-                logging.debug('Extended run %s to the left' %r)
+                forbiddenRunStarts.append(('-', g2_gene_l))
+                log.debug('Extended run %s to the left' % r)
             else:
-                # run could not be extended 
-                logging.info(('Terminate and report run %s, because %s has '
-                    + 'no further consecutive edge.') %(r, curPos))
+                # run could not be extended
+                log.info(('Terminate and report run %s, because %s has '
+                          'no further consecutive edge.') % (r, curPos))
                 reportedRuns.append(r)
 
         # if no edge exists, nothing has to be done...
         if e:
-            for (g2_gene, (direction, weight)) in d[curPos].items():
+            for (g2_gene, (direction, weight)) in list(d[curPos].items()):
                 if (direction, g2_gene) not in forbiddenRunStarts:
-                    j = g2pos[g2_gene] 
+                    j = g2pos[g2_gene]
                     if isinstance(direction, BothStrands):
-                        r = Run(curPos, g2_gene, weight, DIRECTION_CRICK_STRAND)
+                        r = Run(curPos, g2_gene, weight, '+')
                         newRunList.append(r)
                         g1_runs[i].add(r)
                         g2_runs[j].add(r)
-                        logging.debug(('Start new (%s) run %s') %(direction, r))
-                        r = Run(curPos, g2_gene, weight, DIRECTION_WATSON_STRAND)
+                        log.debug(('Start new (%s) run %s') % (direction, r))
+                        r = Run(curPos, g2_gene, weight, '-')
                         newRunList.append(r)
                         g1_runs[i].add(r)
                         g2_runs[j].add(r)
-                        logging.debug(('Start new (%s) run %s') %(direction, r))
+                        log.debug(('Start new (%s) run %s') % (direction, r))
                     else:
                         r = Run(curPos, g2_gene, weight, direction)
                         newRunList.append(r)
                         g1_runs[i].add(r)
                         g2_runs[j].add(r)
-                        logging.debug(('Start new (%s) run %s') %(direction, r))
+                        log.debug(('Start new (%s) run %s') % (direction, r))
         activeRuns = newRunList
     reportedRuns.extend(activeRuns)
     return (g1_runs, g2_runs, reportedRuns)
 
+
 def replaceByNew(g1_runs, g2_runs, i, j, r_old, r_new):
     while r_old in g1_runs[i]:
         g1_runs[i].remove(r_old)
         g1_runs[i].add(r_new)
         g2_runs[j].remove(r_old)
         g2_runs[j].add(r_new)
-        i+=1
-        j+=1
+        i += 1
+        j += 1
         if len(g1_runs) <= i or len(g2_runs) <= j:
             break
 
+
 def doMatching(g1, g2, g1_runs, g2_runs, m, runList):
-    g1pos = dict(izip(g1, xrange(len(g1))))
-    g2pos = dict(izip(g2, xrange(len(g2))))
- 
+    g1pos = dict(zip(g1, range(len(g1))))
+    g2pos = dict(zip(g2, range(len(g2))))
     newRuns = set()
 
-    for k in xrange(g1pos[m.endG1] - g1pos[m.startG1] + 1):
+    for k in range(g1pos[m.endG1] - g1pos[m.startG1] + 1):
         i = g1pos[m.startG1] + k
         j = g2pos[m.startG2] + k
 
         for r in set(g1_runs[i]):
             if r == m:
-                continue 
+                continue
             g1_runs[i].remove(r)
 
             if r in runList:
                 runList.remove(r)
-            
+
             if g1pos[r.startG1] < i:
                 overlap = g1pos[r.endG1] - i
-                logging.info(('Run %s overlaps with selected run %s by %s ' + \
-                        'at position G1:%s.') %(r, m, overlap+1, g1[i]))
+                log.info(('Run %s overlaps with selected run %s by %s ' +
+                          'at position G1:%s.') % (r, m, overlap+1, g1[i]))
                 r_new = deepcopy(r)
                 r_new.endG1 = g1[i-1]
-                if r.direction == DIRECTION_CRICK_STRAND:
+                if r.direction == '+':
                     # check weight
-                    r_new.endG2 = g2[g2pos[r.endG2] - overlap -1]
+                    r_new.endG2 = g2[g2pos[r.endG2] - overlap - 1]
                     r_new.weight = r.weight[:-overlap-1]
                     r.weight = r.weight[-overlap-1:]
                     r.startG2 = g2[g2pos[r.endG2]-overlap]
@@ -428,29 +400,29 @@ def doMatching(g1, g2, g1_runs, g2_runs, m, runList):
                     r.endG2 = g2[g2pos[r.startG2] + overlap]
                     g2_runs[g2pos[r.endG2]].remove(r)
                 r.startG1 = g1[i]
-                logging.info('Divided overlapping run in %s and %s' %(r_new, r))
+                log.info('Divided overlapping run in %s and %s' % (r_new, r))
                 # do you see that r.startG2 is already at the right position?
                 replaceByNew(g1_runs, g2_runs, g1pos[r_new.startG1],
-                        g2pos[r_new.startG2], r, r_new)
+                             g2pos[r_new.startG2], r, r_new)
                 newRuns.add(r_new)
-            
+
             elif g1pos[r.startG1] == i:
-                if r.direction == DIRECTION_CRICK_STRAND:
+                if r.direction == '+':
                     g2_runs[g2pos[r.startG2]].remove(r)
                 else:
                     g2_runs[g2pos[r.endG2]].remove(r)
             if len(g1) > i+1 and i < g1pos[r.endG1]:
                 # run start cannot be larger than i
-                logging.info(('Run %s interfers with current run %s at ' + \
-                        'position G1:%s. Shifting.') %(r, m, g1[i]))
+                log.info(('Run %s interfers with current run %s at ' +
+                          'position G1:%s. Shifting.') % (r, m, g1[i]))
                 r.startG1 = g1[i+1]
                 del r.weight[0]
-                if r.direction == DIRECTION_CRICK_STRAND:
+                if r.direction == '+':
                     r.startG2 = g2[g2pos[r.startG2]+1]
                 else:
                     r.endG2 = g2[g2pos[r.endG2]-1]
 
-                logging.info('Shifted run is now located at %s' %r)
+                log.info('Shifted run is now located at %s' % r)
                 newRuns.add(r)
             elif r in newRuns:
                 newRuns.remove(r)
@@ -459,99 +431,93 @@ def doMatching(g1, g2, g1_runs, g2_runs, m, runList):
             if r == m:
                 continue
             g2_runs[j].remove(r)
- 
+
             if r in runList:
                 runList.remove(r)
-            
+
             if g2pos[r.startG2] < j:
                 overlap = g2pos[r.endG2] - j
-                logging.info(('Run %s overlaps with selected run %s by %s ' + \
-                        'at position G2:%s.') %(r, m, overlap+1, g2[j]))
+                log.info(('Run %s overlaps with selected run %s by %s ' +
+                          'at position G2:%s.') % (r, m, overlap+1, g2[j]))
                 r_new = deepcopy(r)
                 r_new.endG2 = g2[j-1]
-                if r.direction == DIRECTION_CRICK_STRAND:
-                    r_new.endG1 = g1[g1pos[r.endG1]-overlap -1]
+                if r.direction == '+':
+                    r_new.endG1 = g1[g1pos[r.endG1]-overlap-1]
                     r_new.weight = r.weight[:-overlap-1]
                     r.weight = r.weight[-overlap-1:]
                     r.startG1 = g1[g1pos[r.endG1]-overlap]
                     g1_runs[g1pos[r.startG1]].remove(r)
                 else:
-                    r_new.startG1 = g1[g1pos[r.startG1]+overlap+1] 
+                    r_new.startG1 = g1[g1pos[r.startG1]+overlap+1]
                     r_new.weight = r.weight[overlap+1:]
                     r.weight = r.weight[:overlap+1]
                     r.endG1 = g1[g1pos[r.startG1]+overlap]
                     g1_runs[g1pos[r.endG1]].remove(r)
                 r.startG2 = g2[j]
-                logging.info('Divided overlapping run in %s and %s' %(r_new, r))
-#                # do you see that r.startG1 is already at the right position?
-#                if r.direction == DIRECTION_CRICK_STRAND:
-#                    g1_runs[g1pos[r.endG1]].remove(r)
-#                else:
-#                    g1_runs[g1pos[r.startG1]].remove(r)
+                log.info('Divided overlapping run in %s and %s' % (r_new, r))
                 replaceByNew(g1_runs, g2_runs, g1pos[r_new.startG1],
-                        g2pos[r_new.startG2], r, r_new)
+                             g2pos[r_new.startG2], r, r_new)
                 newRuns.add(r_new)
 
             elif g2pos[r.startG2] == j:
-                if r.direction == DIRECTION_CRICK_STRAND:
+                if r.direction == '+':
                     g1_runs[g1pos[r.startG1]].remove(r)
                 else:
                     g1_runs[g1pos[r.endG1]].remove(r)
 
             if len(g2) > j+1 and j < g2pos[r.endG2]:
                 # run start cannot be larger than j
-                logging.info(('Run %s interfers with current run %s at ' + \
-                        'position G2:%s. Shifting.') %(r, m, g2[j]))
+                log.info(('Run %s interfers with current run %s at ' +
+                          'position G2:%s. Shifting.') % (r, m, g2[j]))
                 r.startG2 = g2[j+1]
-                if r.direction == DIRECTION_CRICK_STRAND:
+                if r.direction == '+':
                     r.startG1 = g1[g1pos[r.startG1]+1]
                     del r.weight[0]
                 else:
                     r.endG1 = g1[g1pos[r.endG1]-1]
                     del r.weight[-1]
-                logging.info('Shifted run is now located at %s' %r)
+                log.info('Shifted run is now located at %s' % r)
                 newRuns.add(r)
             elif r in newRuns:
                 newRuns.remove(r)
     insertIntoRunList(newRuns, runList)
 
+
 def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched):
-    g1pos = dict(izip(g1, xrange(len(g1))))
-    g2pos = dict(izip(g2, xrange(len(g2))))
+    g1pos = dict(zip(g1, range(len(g1))))
+    g2pos = dict(zip(g2, range(len(g2))))
 
     newRuns = set()
-    wSrt = lambda x: x.getWeight(alpha)
     mod_g1 = list(mod_g1)
-    for x in xrange(len(mod_g1)):
+    for x in range(len(mod_g1)):
         g1i = mod_g1[x]
         i = g1pos[g1i]
         if len(g1) < i+2:
             continue
 
-
         # To understand this piece of code, one observation is important:
-        # If r1 or r2 is already matched, then there exist only one combination
+        # If r1 or r2 is already matched, then there exists only one combination
         # of possible merges. If r1 and r2 are both unmatched, several merges
-        # are possible and all should be done. 
+        # are possible and all should be done.
         # After each merge between a matched and unmatched run, the newly
         # merged run must be completely matched, before further modification
-        # points (mod_g1) can be processed. 
+        # points (mod_g1) can be processed.
 
         for r1, r2 in product(sorted(g1_runs[i].difference(g1_runs[i+1]),
-            key=wSrt, reverse=True),
-                sorted(g1_runs[i+1].difference(g1_runs[i]), key=wSrt,
-                    reverse=True)):
+                                     key=lambda x: x.getWeight(ALPHA), reverse=True),
+                              sorted(g1_runs[i+1].difference(g1_runs[i]),
+                                     key=lambda x: x.getWeight(ALPHA), reverse=True)):
             if r1.endG1 == g1[i] and r2.startG1 == g1[i+1] and \
                     r1.direction == r2.direction and \
                     r1.endG1[0] == r2.startG1[0] and \
                     r1.endG2[0] == r2.startG2[0] and \
-                    ((r1.direction == DIRECTION_CRICK_STRAND and \
-                    g2pos[r1.endG2] == g2pos[r2.startG2] -1) or \
-                    (r1.direction == DIRECTION_WATSON_STRAND and \
-                    g2pos[r2.endG2] == g2pos[r1.startG2] -1)):
+                    ((r1.direction == '+' and
+                      g2pos[r1.endG2] == g2pos[r2.startG2] - 1) or
+                     (r1.direction == '-' and
+                      g2pos[r2.endG2] == g2pos[r1.startG2] - 1)):
 
-                logging.info('Merge runs %s and %s.' %(r1, r2))
-                if r1 in runList: 
+                log.info('Merge runs %s and %s.' % (r1, r2))
+                if r1 in runList:
                     runList.remove(r1)
                 if r2 in runList:
                     runList.remove(r2)
@@ -561,13 +527,13 @@ def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched):
 
                 r2.startG1 = r1.startG1
                 r2.weight = r1.weight + r2.weight
-                if r1.direction == DIRECTION_CRICK_STRAND:
+                if r1.direction == '+':
                     r2.startG2 = r1.startG2
                 else:
                     r2.endG2 = r1.endG2
-                logging.info('Merged run is %s' %r2)  
+                log.info('Merged run is %s' % r2)
                 replaceByNew(g1_runs, g2_runs, g1pos[r1.startG1],
-                        g2pos[r1.startG2], r1, r2)
+                             g2pos[r1.startG2], r1, r2)
                 if (r2 in alreadyMatched) ^ (r1 in alreadyMatched):
                     if r1 in alreadyMatched:
                         alreadyMatched.remove(r1)
@@ -585,6 +551,7 @@ def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched):
     insertIntoRunList(newRuns, runList)
     return None, []
 
+
 def removeSingleGenes(genome, genome_runs):
     del_res = set()
     mod_res = set()
@@ -596,37 +563,39 @@ def removeSingleGenes(genome, genome_runs):
             del genome[i]
             del genome_runs[i]
         else:
-            i+=1
+            i += 1
     return del_res, mod_res
 
+
 def findRandomRunSequence(g1, g2, dists, topXperCent):
     g2dists = dict()
-    for g1i, x in dists.items():
-        for g2j, d in x.items():
+    for g1i, x in list(dists.items()):
+        for g2j, d in list(x.items()):
             if g2j not in g2dists:
                 g2dists[g2j] = dict()
             g2dists[g2j][g1i] = d
 
     # copy g1, g2 and dists map, because we'll modify it. Also remove all genes
     # that do not contain edges.
-    g1 = [x for x in g1 if dists.has_key(x) and len(dists[x])]
-    g2 = [x for x in g2 if g2dists.has_key(x) and len(g2dists[x])]
+    g1 = [x for x in g1 if x in dists and len(dists[x])]
+    g2 = [x for x in g2 if x in g2dists and len(g2dists[x])]
 
-    g1pos = dict(izip(g1, xrange(len(g1))))
+    g1pos = dict(zip(g1, range(len(g1))))
 
     g1_runs, g2_runs, runs = getAllRuns(g1, g2, dists)
-    logging.info('Found %s runs.' %len(runs))
-    # sort 
-    runList = sorted(runs, key=lambda x: x.getWeight(alpha))
-    
+    log.info('Found %s runs.' % len(runs))
+    # sort
+    runList = sorted(runs, key=lambda x: x.getWeight(ALPHA))
+
     res = set()
     while runList:
-        noOfAdjacencies = len(filter(lambda x: x.getWeight(alpha) and x.getWeight(alpha) or 0, runList))
+        noOfAdjacencies = len([x for x in runList if x.getWeight(ALPHA) and x.getWeight(ALPHA) or 0])
         if noOfAdjacencies:
             randPos = randint(1, ceil(noOfAdjacencies * topXperCent))
         else:
             randPos = randint(1, ceil(len(runList) * topXperCent))
-        logging.info('From %s, select randomly among top %s run %s' %(len(runList), int(ceil((noOfAdjacencies or len(runList))* topXperCent)), runList[-randPos]))
+        log.info('From %s, select randomly among top %s run %s' %
+                 (len(runList), int(ceil((noOfAdjacencies or len(runList)) * topXperCent)), runList[-randPos]))
         mx = runList.pop(-randPos)
         mod_g1 = set()
         while mx:
@@ -634,46 +603,46 @@ def findRandomRunSequence(g1, g2, dists, topXperCent):
             # update run list
             doMatching(g1, g2, g1_runs, g2_runs, mx, runList)
             del_g1, new_mod_g1 = removeSingleGenes(g1, g1_runs)
-            if del_g1: 
-                logging.info('Zombie genes removed from G1: %s' %', '.join(map(str, del_g1)))
+            if del_g1:
+                log.info('Zombie genes removed from G1: %s' % ', '.join(map(str, del_g1)))
                 # it can happen that a gene in mod_g1 has already been deleted
                 # before being processed. This happens if there is a merge between
                 # a matched and unmatched run. Then some genes remain unprocessed
                 # while the merged run is re-matched. In this process, new genes
                 # can be deleted. If one of the genes happens to be in mod_g1, it
-                # should be deleted. 
+                # should be deleted.
                 for g in del_g1.intersection(mod_g1):
                     mod_g1.remove(g)
 
-                g1pos = dict(izip(g1, xrange(len(g1))))
+                g1pos = dict(zip(g1, range(len(g1))))
             # add new modification points
             mod_g1.update(new_mod_g1)
 
             del_g2, mod_g2 = removeSingleGenes(g2, g2_runs)
             if del_g2:
-                logging.info('Zombie genes removed from G2: %s' %', '.join(map(str, del_g2)))
+                log.info('Zombie genes removed from G2: %s' % ', '.join(map(str, del_g2)))
                 for g2j in mod_g2:
-                    for g1i, (d, _) in g2dists[g2j].items():
+                    for g1i, (d, _) in list(g2dists[g2j].items()):
                         if g1i in g1:
-                            if d == DIRECTION_CRICK_STRAND:
+                            if d == '+':
                                 mod_g1.add(g1i)
-                            # what, if d == DIRECTION_BOTH_STRANDS? Then, both neighbors have to be added...
-                            if d == DIRECTION_WATSON_STRAND:
+                            # d == DIRECTION_BOTH_STRANDS? both neighbors have to be added...
+                            if d == '-':
                                 mod_g1.add(g1[g1pos[g1i]-1])
             # merge runs
             mx, mod_g1 = mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs,
-                    runList, res)
-  
-    if res: 
-        logging.info('Matching finished. Longest run size is %s.' %(max(map(len,
-            res))))
+                                   runList, res)
+
+    if res:
+        log.info('Matching finished. Longest run size is %s.' % (max(list(map(len, res)))))
     else:
-        logging.info('Matching finished, but no runs found. Empty input?')
+        log.info('Matching finished, but no runs found. Empty input?')
 
     return (g1, g2, g1_runs, g2_runs, res)
 
+
 def repeatMatching(g1, g2, g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching,
-        minCsSize, topXperCent):
+                   minCsSize, topXperCent):
 
     g1_mod_res = g1_mod
     g2_mod_res = g2_mod
@@ -681,24 +650,23 @@ def repeatMatching(g1, g2, g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching,
     g2_runs_res = g2_runs
     selectedRuns_res = list()
 
-    g1pos = dict(izip(g1_mod, xrange(len(g1_mod))))
-    g2pos = dict(izip(g2_mod, xrange(len(g2_mod))))
-
+    g1pos = dict(zip(g1_mod, range(len(g1_mod))))
+    g2pos = dict(zip(g2_mod, range(len(g2_mod))))
 
     noReps = repMatching
 
     while repMatching:
-        for i in xrange(len(g1_runs)):
+        for i in range(len(g1_runs)):
             run_set = g1_runs[i]
             if len(run_set) != 1:
-                logging.error(('Expected run, set length of 1, but was told' + \
-                        ' different: %s.') %(', '.join(map(str, run_set))))
-            run = run_set.__iter__().next() 
+                log.error('Expected run, set length of 1, but was told different: %s.' %
+                          ', '.join(map(str, run_set)))
+            run = next(run_set.__iter__())
 
             g1i = g1_mod[i]
 
             j = i-g1pos[run.startG1]
-            if run.direction == DIRECTION_CRICK_STRAND:
+            if run.direction == '+':
                 g2j = g2_mod[g2pos[run.startG2] + j]
             else:
                 g2j = g2_mod[g2pos[run.endG2] - j]
@@ -708,190 +676,160 @@ def repeatMatching(g1, g2, g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching,
                 del dists[g1i]
 
         if not dists:
-            logging.info(('Removed all edges in the input graph. Stopping ' + \
-                    'iteration %s.') % (noReps-repMatching+2))
+            log.info('Removed all edges in the input graph. Stopping iteration %s.' %
+                     (noReps-repMatching+2))
             break
 
         g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns = findRandomRunSequence(g1, g2, dists, topXperCent)
         checkMatching(g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns, dists)
 
-        logging.info(('Obtained %s adjacencies in matching of size %s from ' + \
-                'iteration %s.') %(len(g1_mod) - len(selectedRuns),
-                    len(g1_mod), noReps-repMatching+2))
+        log.info('Obtained %s adjacencies in matching of size %s from iteration %s.' %
+                 (len(g1_mod) - len(selectedRuns), len(g1_mod), noReps-repMatching+2))
 
         # remove runs that fall below min length of minCsSize
-        ff = lambda x: len(x.__iter__().next()) >= minCsSize
-        g1_mod = [g1_mod[i] for i in xrange(len(g1_mod)) if ff(g1_runs[i])]
-        g2_mod = [g2_mod[i] for i in xrange(len(g2_mod)) if ff(g2_runs[i])]
-        g1_runs = filter(ff, g1_runs)
-        g2_runs = filter(ff, g2_runs)
+        ff = lambda x: len(next(x.__iter__())) >= minCsSize
+        g1_mod = [g1_mod[i] for i in range(len(g1_mod)) if ff(g1_runs[i])]
+        g2_mod = [g2_mod[i] for i in range(len(g2_mod)) if ff(g2_runs[i])]
+        g1_runs = list(filter(ff, g1_runs))
+        g2_runs = list(filter(ff, g2_runs))
         selectedRuns = set([s for s in selectedRuns if len(s) >= minCsSize])
 
         # stop if no runs were found matching the criteria
         if not len(selectedRuns):
-            logging.info(('No feasible runs found in matching round %s. Stopping ' + \
-                    'iteration.') % (noReps-repMatching+2))
+            log.info('No feasible runs found in matching round %s. Stopping iteration.' %
+                     (noReps-repMatching+2))
             break
 
-        logging.info('%s feasible runs retained.' %len(selectedRuns))
+        log.info('%s feasible runs retained.' % len(selectedRuns))
 
         # reconciliate with result data
-        g2pos = dict(izip(g2_mod, xrange(len(g2_mod))))
-        g1pos = dict(izip(g1_mod, xrange(len(g1_mod))))
-        g2pos_res = dict(izip(g2_mod_res, xrange(len(g2_mod_res))))
-        g1pos_res = dict(izip(g1_mod_res, xrange(len(g1_mod_res))))
-        
-        chr_srt = lambda x, y: x[0] == y[0] and (x[1] < y[1] and -1 or 1) or (x[0] < y[0] and -1 or 1)
-        g1_mod_new = sorted(set(g1_mod_res + g1_mod), cmp=chr_srt)
-        g2_mod_new = sorted(set(g2_mod_res + g2_mod), cmp=chr_srt)
+        g2pos = dict(zip(g2_mod, range(len(g2_mod))))
+        g1pos = dict(zip(g1_mod, range(len(g1_mod))))
+        g2pos_res = dict(zip(g2_mod_res, range(len(g2_mod_res))))
+        g1pos_res = dict(zip(g1_mod_res, range(len(g1_mod_res))))
+
+        g1_mod_new = sorted(set(g1_mod_res + g1_mod), key=lambda x: (x[0], x[1]))
+        g2_mod_new = sorted(set(g2_mod_res + g2_mod), key=lambda x: (x[0], x[1]))
         g1_runs_new = list()
         g2_runs_new = list()
 
         for g1i in g1_mod_new:
             x = set()
-            if g1pos_res.has_key(g1i):
+            if g1i in g1pos_res:
                 x.update(g1_runs_res[g1pos_res[g1i]])
-            if g1pos.has_key(g1i):
+            if g1i in g1pos:
                 x.update(g1_runs[g1pos[g1i]])
             g1_runs_new.append(x)
 
         for g2j in g2_mod_new:
             x = set()
-            if g2pos_res.has_key(g2j):
+            if g2j in g2pos_res:
                 x.update(g2_runs_res[g2pos_res[g2j]])
-            if g2pos.has_key(g2j):
+            if g2j in g2pos:
                 x.update(g2_runs[g2pos[g2j]])
-            g2_runs_new.append(x) 
+            g2_runs_new.append(x)
 
         g1_mod_res = g1_mod_new
         g2_mod_res = g2_mod_new
         g1_runs_res = g1_runs_new
         g2_runs_res = g2_runs_new
 
-        selectedRuns_res.extend(selectedRuns) 
+        selectedRuns_res.extend(selectedRuns)
         repMatching -= 1
 
     return (g1_mod_res, g2_mod_res, g1_runs_res, g2_runs_res, selectedRuns_res)
 
+
 def printMatching(g1, g2, g1_runs, hasMultipleChromosomes, out):
 
-    if hasMultipleChromosomes:
-        print >> f, 'Chr(G1)\tG1\tChr(G2)\tG2\tdirection\tedge weight'
-    else:
-        print >> f, 'G1\tG2\tdirection\tedge weight'
-
-    g2pos = dict(izip(g2, xrange(len(g2))))
-    g1pos = dict(izip(g1, xrange(len(g1))))
-
-
-    cur_index = dict()
-    for i in xrange(len(g1_runs)):
-        run_set = g1_runs[i]
-        for run in run_set:
-            g1i = g1[i]
-            j = 0
-            if cur_index.has_key(run):
-                j = cur_index[run]
-            if run.direction == DIRECTION_CRICK_STRAND:
-                g2j = g2[g2pos[run.startG2] + j]
-            else:
-                g2j = g2[g2pos[run.endG2] - j]
+    with open(out, 'w') as of:
 
-            direction = run.direction == DIRECTION_CRICK_STRAND and '1' or '-1'
+        if hasMultipleChromosomes:
+            of.write('Chr(G1)\tG1\tChr(G2)\tG2\tdirection\tedge weight\n')
+        else:
+            of.write('G1\tG2\tdirection\tedge weight\n')
 
-            g1i1 = g1i[1] == -1 and 'TELOMERE_START' or g1i[1]
-            g1i1 = g1i[1] == maxint and 'TELOMERE_END' or g1i1
-            g2j1 = g2j[1] == -1 and 'TELOMERE_START' or g2j[1]
-            g2j1 = g2j[1] == maxint and 'TELOMERE_END' or g2j1
+        g2pos = dict(zip(g2, range(len(g2))))
+        # g1pos = dict(zip(g1, range(len(g1))))
 
-            if hasMultipleChromosomes:
-                print >> f, '%s\t%s\t%s\t%s\t%s\t%s' %(g1i[0], g1i1, g2j[0],
-                        g2j1, direction, run.weight[j])
-            else:
-                print >> f, '%s\t%s\t%s\t%s' %(g1i1, g2j1, direction,
-                        run.weight[j])
+        cur_index = dict()
+        for i in range(len(g1_runs)):
+            run_set = g1_runs[i]
+            for run in run_set:
+                g1i = g1[i]
+                j = 0
+                if run in cur_index:
+                    j = cur_index[run]
+                if run.direction == '+':
+                    g2j = g2[g2pos[run.startG2] + j]
+                else:
+                    g2j = g2[g2pos[run.endG2] - j]
 
-            cur_index[run] = j+1
+                direction = run.direction == '+' and '1' or '-1'
 
-if __name__ == '__main__':
-    if len(argv) < 3 or len(argv) > 8:
-        print '\tusage: %s <DIST FILE> <ALPHA> [ <EDGE WEIGHT THRESHOLD> --repeat-matching (-R) <NUMBER >= 2> --min-cs-size (-M) <NUMBER >= 1> ]' %argv[0]
-        exit(1)
-   
-    repMatching= '--repeat-matching' in argv or '-R' in argv
-    minCsSize = '--min-cs-size' in argv or '-M' in argv
-
-    if minCsSize:
-        pos = '-M' in argv and argv.index('-M') or argv.index('--min-cs-size') 
-        minCsSize = int(argv[pos+1])
-        argv = argv[:pos] + argv[pos+2:]
-        if not repMatching:
-            print >> stderr, ('Argument --min-cs-size (-M) only valid in ' + \
-                    'combination with --repeat-matching (-R)')
-            exit(1)
-    else:
-        minCsSize = 1
-    if repMatching: 
-        pos = '-R' in argv and argv.index('-R') or argv.index('--repeat-matching') 
-        repMatching = int(argv[pos+1]) - 1
-        argv = argv[:pos] + argv[pos+2:]
-    else:
-        repMatching = 0
+                g1i1 = g1i[1] == -1 and 'TELOMERE_START' or g1i[1]
+                g1i1 = g1i[1] == maxsize and 'TELOMERE_END' or g1i1
+                g2j1 = g2j[1] == -1 and 'TELOMERE_START' or g2j[1]
+                g2j1 = g2j[1] == maxsize and 'TELOMERE_END' or g2j1
 
-    # set as global parameter
-    alpha = float(argv[2])
-    edgeThreshold = len(argv) == 4 and float(argv[3]) or 0
+                if hasMultipleChromosomes:
+                    of.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
+                             (g1i[0], g1i1, g2j[0], g2j1, direction, run.weight[j]))
+                else:
+                    of.write('%s\t%s\t%s\t%s\n' % (g1i1, g2j1, direction, run.weight[j]))
 
-#    logFileName = '%s.log' %(basename(argv[1]).rsplit('.')[0])
-    logFileName = '%s.log' %(argv[1].rsplit('.')[0])			# Respect given path
-    logging.basicConfig(filename=logFileName,filemode='w', level=logging.INFO,
-            format= "%(levelname)s\t%(asctime)s\t++ %(message)s")
+                cur_index[run] = j+1
 
-    greedy = 10.**-7
 
-    hasMultipleChromosomes, g1, g2, dists = readDistsAndOrder(open(argv[1]), edgeThreshold)
-    g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns = findRandomRunSequence(g1,
-            g2, dists, greedy)
+if __name__ == '__main__':
+    from argparse import ArgumentParser as AP
+    cli = AP(description='')
+    cli.add_argument('-R', '--repeat-matching', type=int, metavar='N', default=0,
+                     help='match N repetitions (default: 0)')
+    cli.add_argument('-M', '--min-cs-size', type=int, metavar='N', default=1,
+                     help='minimal cs size (default: 1)')  # should be conditional on -R
+    cli.add_argument('-g', '--greedy', type=float, metavar='F', default=10.**-7)
+    cli.add_argument('-e', '--edge_weight_threshold', type=float, default=0.0)
+    cli.add_argument('-a', '--alpha', type=float, metavar='F', default=0.5)
+    cli.add_argument('dist_file')
+    args = cli.parse_args()
+    alpha = args.alpha  # set as global parameter
+    repMatching = args.repeat_matching
+    if repMatching > 0:
+        repMatching -= 1
+
+    log.basicConfig(filename=args.dist_file.rsplit('.')[0]+'.log', level=log.INFO,
+                    format="%(levelname)s\t%(asctime)s\t++ %(message)s")
+
+    multiChrom, g1, g2, dists = readDistsAndOrder(args.dist_file, args.edge_weight_threshold)
+    g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns = findRandomRunSequence(g1, g2, dists, args.greedy)
     checkMatching(g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns, dists)
 
     # calculate number of breakpoints only from result of the first matching
-    bkp = len(selectedRuns) -1
+    bkp = len(selectedRuns) - 1
 
     g1_mod, g2_mod, g1_runs, g2_runs, selectedRuns_new = repeatMatching(g1, g2,
-            g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching, minCsSize, greedy)
+            g1_mod, g2_mod, g1_runs, g2_runs, dists, repMatching, args.min_cs_size, args.greedy)
 
     selectedRuns.update(selectedRuns_new)
 
-    #
-    # calculate additional values
-    #
-    
     # sum of weights of adjacencies
     wAdj = sum([r.getWeight(1) for r in selectedRuns])
     # sum of weights of all edges of the matching
-    wEdg = sum([sum(map(lambda x: x**2, r.weight)) for r in selectedRuns])
+    wEdg = sum([sum([x**2 for x in r.weight]) for r in selectedRuns])
 
     edg = sum(map(len, selectedRuns))
 
-    #
-    # print matching
-    #
-
-#    out_file = basename(argv[1])
-    out_file = argv[1] # respect given path
-    f = open('%s.matching' %out_file[:out_file.rfind('.')], 'w')
-    printMatching(g1_mod, g2_mod, g1_runs, hasMultipleChromosomes, f)
-    f.flush()
-    f.close()
+    matchfile = args.dist_file.rsplit('.')[0]+'.matching'
+    printMatching(g1_mod, g2_mod, g1_runs, multiChrom, matchfile)
 
     #
     # print matching scores
     #
 
-    logging.info(('FFAdj-MCS finished. Breakpoint distance between G1 and G2' + \
-            ' is %s with #edg = %s, adj(M) = %.3f and edg(M) = %.3f') %(bkp, edg,
-                wAdj, wEdg))
-
-    print '#bkp\t#edg\tadj\tedg'
-    print '%s\t%s\t%.6f\t%.6f' %(bkp, edg, wAdj, wEdg)
+    log.info(('FFAdj-MCS finished. Breakpoint distance between G1 and G2' +
+              ' is %s with #edg = %s, adj(M) = %.3f and edg(M) = %.3f') %
+             (bkp, edg, wAdj, wEdg))
 
+    print('#bkp\t#edg\tadj\tedg')
+    print('%s\t%s\t%.6f\t%.6f' % (bkp, edg, wAdj, wEdg))

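The hunk above also replaces the Python 2 cmp-based sort (the removed chr_srt comparator) with a key-based sort on (chromosome, position) tuples in the "reconciliate with result data" block. A minimal sketch, not part of the commit, checking that the two orderings agree for distinct pairs; the sample tuples are made up for illustration:

    # Illustrative only: compares the removed comparator with the new key function.
    from functools import cmp_to_key

    positions = [('chr2', 5), ('chr1', 7), ('chr1', 3)]

    # Python 2 style comparator as removed by this commit; it never returns 0,
    # so it only matches a lexicographic sort when all pairs are distinct.
    chr_srt = lambda x, y: x[0] == y[0] and (x[1] < y[1] and -1 or 1) or (x[0] < y[0] and -1 or 1)

    assert sorted(positions, key=cmp_to_key(chr_srt)) == \
           sorted(positions, key=lambda x: (x[0], x[1]))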


View it on GitLab: https://salsa.debian.org/med-team/proteinortho/commit/0d5a5400614659b27b8174f8fbdc4e2e637526d0