[med-svn] [proteinortho] 02/07: Imported Upstream version 5.12+dfsg

Andreas Tille tille at debian.org
Thu Mar 10 19:55:28 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository proteinortho.

commit d8a5c6c74466dee2d33e858dd7e78451d499dafc
Author: Andreas Tille <tille at debian.org>
Date:   Thu Mar 10 20:28:30 2016 +0100

    Imported Upstream version 5.12+dfsg
---
 Makefile                     |   7 +-
 basic-types.h                |  58 ++++++
 ffadj_mcs.py                 |   6 +-
 manual.html                  |   3 +-
 po2tree.pl                   | 161 ++++++++++++++
 po_tree                      | Bin 0 -> 17650 bytes
 po_tree.c                    | 484 +++++++++++++++++++++++++++++++++++++++++++
 proteinortho5.pl             |  20 +-
 proteinortho5_clustering.cpp |  36 +++-
 proteinortho5_tree           | Bin 0 -> 72584 bytes
 10 files changed, 761 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index bb54a9d..3c8b861 100644
--- a/Makefile
+++ b/Makefile
@@ -4,15 +4,18 @@ INSTALLDIR=/usr/local/bin
 CPP      = g++
 CPPFLAGS   = -Wall -O3
 
-all: proteinortho5_clustering proteinortho5_clean_edges
+all: proteinortho5_clustering proteinortho5_clean_edges proteinortho5_tree
 
 proteinortho5_clustering: proteinortho5_clustering.cpp
 	$(CPP) $(CPPFLAGS) -o $@ $<
 
+proteinortho5_tree: proteinortho5_clustering.cpp
+	$(CPP) $(CPPFLAGS) -o $@ $<
+
 proteinortho5_clean_edges: proteinortho5_clean_edges.cpp
 	$(CPP) $(CPPFLAGS) -o $@ $<
 
-install: proteinortho5.pl proteinortho5_clustering proteinortho5_clean_edges2.pl proteinortho5_singletons.pl ffadj_mcs.py
+install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po_tree po_tree.pl
 	install -v $^ $(INSTALLDIR)
 
 test: proteinortho5.pl proteinortho5_clustering
diff --git a/basic-types.h b/basic-types.h
new file mode 100644
index 0000000..5aa1dcd
--- /dev/null
+++ b/basic-types.h
@@ -0,0 +1,58 @@
+#ifndef BASIC_TYPES_H
+#define BASIC_TYPES_H
+
+
+#ifdef __CYGWIN__ 
+
+#define CRLF '\r'
+
+#else
+#define CRLF ' '
+#endif
+
+#define MAXBUFFERSIZE 10000
+#define BASEINC 10000
+typedef unsigned char Uchar;
+typedef unsigned int Uint;
+typedef signed int Sint;
+typedef unsigned char BOOL;
+
+#define True '1'
+#define False '0'
+
+typedef struct {
+  int  a, 
+       b;
+} PairSint; 
+
+
+typedef struct {
+  Uint  a, 
+       b;
+} PairUint; 
+
+
+typedef struct {
+  Uint a,
+      b,
+      c;
+} TripleUint;
+
+
+typedef struct {
+  int a,
+      b,
+      c;
+} TripleSint;
+
+typedef struct {
+  int a,
+      b,
+      c,
+      d;
+} QuadSint;
+
+
+
+#endif
+
diff --git a/ffadj_mcs.py b/ffadj_mcs.py
index 4967ffd..485af76 100755
--- a/ffadj_mcs.py
+++ b/ffadj_mcs.py
@@ -842,7 +842,8 @@ if __name__ == '__main__':
     alpha = float(argv[2])
     edgeThreshold = len(argv) == 4 and float(argv[3]) or 0
 
-    logFileName = '%s.log' %(basename(argv[1]).rsplit('.')[0])
+#    logFileName = '%s.log' %(basename(argv[1]).rsplit('.')[0])
+    logFileName = '%s.log' %(argv[1].rsplit('.')[0])			# Respect given path
     logging.basicConfig(filename=logFileName,filemode='w', level=logging.INFO,
             format= "%(levelname)s\t%(asctime)s\t++ %(message)s")
 
@@ -876,7 +877,8 @@ if __name__ == '__main__':
     # print matching
     #
 
-    out_file = basename(argv[1])
+#    out_file = basename(argv[1])
+    out_file = argv[1] # respect given path
     f = open('%s.matching' %out_file[:out_file.rfind('.')], 'w')
     printMatching(g1_mod, g2_mod, g1_runs, hasMultipleChromosomes, f)
     f.flush()
diff --git a/manual.html b/manual.html
index 0227a67..fac0228 100644
--- a/manual.html
+++ b/manual.html
@@ -4,7 +4,7 @@
 	</head>
 <body>
 <h1>Proteinortho Manual / PoFF Manual</h1>
-<small>This manual corresponds to version 5.11</small>
+<small>This manual corresponds to version 5.12</small>
 <h2>Introduction</h2>
 Proteinortho is a tool to detect orthologous genes within different species.
 For doing so, it compares similarities of given gene sequences and clusters them to find significant groups.
@@ -175,6 +175,7 @@ They are:
 </ul></td></tr>
 
 <tr><td>-project=</td><td>myproject</td><td> prefix for all result file names</td></tr>
+<tr><td>-temp=</td><td>working directory</td><td> path for temporary files</td></tr>
 <tr><td>-synteny</td><td>-</td><td>activate PoFF extension to separate similar sequences using synteny (requires a GFF file for each FASTA file)</td></tr>
 <tr><td>-nograph</td><td>-</td><td>do not generate .graph files with pairwise orthology data<br>saves some time</td></tr>
 <tr><td>-keep</td><td>-</td><td>store temporary blast results for reuse</td></tr>
diff --git a/po2tree.pl b/po2tree.pl
new file mode 100755
index 0000000..77d5435
--- /dev/null
+++ b/po2tree.pl
@@ -0,0 +1,161 @@
+#!/usr/bin/perl
+
+##########################################################################################
+#	  This file is part of proteinortho.
+#	  (C) 2009 Marcus Lechner
+# 
+#	  proteinortho is free software; you can redistribute it and/or modify
+#	  it under the terms of the GNU General Public License as published
+#	  by the Free Software Foundation; either version 2, or (at your
+#	  option) any later version.
+#
+#	  proteinortho is distributed in the hope that it will be useful, but
+#	  WITHOUT ANY WARRANTY; without even the implied warranty of
+#	  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#	  General Public License for more details.
+#
+#	  You should have received a copy of the GNU General Public License
+#	  along with proteinortho; see the file COPYING.  If not, write to the
+#	  Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+#	  Boston, MA 02111-1307, USA.	
+##########################################################################################
+
+##########################################################################################
+# About
+##########################################################################################
+# orthomatrix2tree
+# input ortheset output from proteinortho
+# output corresponding UPGMA tree in newick format
+#
+# branch labels show the number of common genes in the subtrees
+# branch lengths represent the number of new common genes since the last node
+# 
+# @author Marcus Lechner
+# @email lechner at staff.uni-marburg.de
+# @company Bioinformatics, University of Leipzig
+# @version 3.10
+# @date 08-12-2015
+#
+##########################################################################################
+
+##########################################################################################
+# Imports
+##########################################################################################
+use strict;
+use warnings "all";
+use File::Basename;
+
+
+##########################################################################################
+# Usage, defaults and parameters
+##########################################################################################
+my $usage= << "JUS";
+  usage:   po2tree.pl [OPTIONS] ORTHOMATRIX >OUTTREE
+  input:   output from Proteinortho5
+  output:  corresponding UPGMA tree in newick format
+  options: -o=[FILE]  prints output to the given file rather than STDOUT  
+JUS
+
+# path to the C-part of this program
+my @tmppath = fileparse($0);
+our $scriptpath = $tmppath[1];
+
+# Output switch
+my $o = "";
+my $path = "";
+my $file = "";
+
+# No parameters
+unless (defined($ARGV[0])) {
+	print "Error: No input file defined!\n\n$usage";
+	exit 1;
+}
+
+foreach (@ARGV) {
+	if 	($_ =~ /^--?o.*=(.+)/) {$o = $1;}						# output file
+	elsif	($_ =~ /^-/) {	print STDERR "Unknown parameter $_\n$usage"; exit 1;}
+	elsif	(!-e $_) {	print STDERR "Could not find file $_\n"; exit 1;}
+	else 	{$file = $_;}
+}
+
+# open the given file, this should be a proteinortho matrix
+open(SOURCE,"<$file") || die("Could not open file $file\n$!");
+# write temp file
+open(TRG,">$file.tmp.matrix2") || die("Could not open tmp-file $file.tmp.matrix2, $!");
+my $did = 0;
+my @viecher = ();
+
+my @present_in_groups;
+# convert the orthoset to a 1/0 matrix
+print STDERR "Preparing datafile...\n";
+while(<SOURCE>) {
+	my @data = split;
+	# first line
+	if ($data[0] =~ /^\#/) {
+		# remove the first elements
+		shift(@data);shift(@data);shift(@data);
+
+		# first line
+		if ($did == 0) {
+			shift(@data);
+			# species names to header
+			@viecher = @data;
+		}
+	
+	}
+	else {
+		shift(@data);shift(@data);shift(@data);
+		# any futher line
+		my $i = 0;
+		foreach (@data) {
+			# just transform the entries to 0 and 1
+			# the gene-names are not important here
+			if ($_ eq "*") {
+				print TRG 0;
+			}
+			else {
+				# Count number of copies
+				my $num = 1;
+#				$num += $_ =~ /,/g;		# count number of copies (not yet respected in C)
+				print TRG $num;
+				$present_in_groups[$i]++; # count group
+			}
+			print TRG " ";
+			$i++;
+		}
+		print TRG "\n";
+	}
+}
+close(SOURCE);
+close(TRG);
+
+open(TMP,">$file.tmp.matrix") || die("Could not open tmp-file $file.tmp.matrix, $!");
+	for (my $i = 0; $i < scalar(@viecher); $i++) {
+		$viecher[$i] =~ s/\..*?$//; 			# remove point
+		$viecher[$i] =~ s/\s/_/g; 			# alphabet
+		$viecher[$i] .= "_[$present_in_groups[$i]]";	# add number of groups
+	}
+	print TMP scalar(@viecher)."\n";
+	print TMP join(" ", at viecher)."\n";
+close(TMP);
+system("cat '$file.tmp.matrix2' >>'$file.tmp.matrix'");
+
+# Run the main algorithm in C
+print STDERR "Calculating tree...\n";
+my $run = $scriptpath."po_tree";
+my $out = qx($run $ARGV[0].tmp.matrix);
+$out =~ s/\[.+?\]//g;
+$out =~ s/\s:/:/g;
+
+if ($o ne "") {
+	open(OUT,">$o") || die("Could not write output file $!");
+	print OUT $out;
+	close(OUT);
+}
+else {
+	print $out;
+}
+
+# unlink temporary files
+unlink("$file.tmp.matrix");
+unlink("$file.tmp.matrix2");
diff --git a/po_tree b/po_tree
new file mode 100755
index 0000000..09ca5a2
Binary files /dev/null and b/po_tree differ
diff --git a/po_tree.c b/po_tree.c
new file mode 100644
index 0000000..8b6604d
--- /dev/null
+++ b/po_tree.c
@@ -0,0 +1,484 @@
+/*
+	  This file is part of proteinortho.
+	  (C) 2009 Marcus Lechner
+ 
+	  proteinortho is free software; you can redistribute it and/or modify
+	  it under the terms of the GNU General Public License as published
+	  by the Free Software Foundation; either version 2, or (at your
+	  option) any later version.
+ 
+	  proteinortho is distributed in the hope that it will be useful, but
+	  WITHOUT ANY WARRANTY; without even the implied warranty of
+	  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+	  General Public License for more details.
+ 
+	  You should have received a copy of the GNU General Public License
+	  along with proteinortho; see the file COPYING.  If not, write to the
+	  Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+	  Boston, MA 02111-1307, USA.	
+*/
+
+/*
+ *  TreebuilderC - core part of proteinorthos treebulider
+ *  generates a newicktree from proteinortho's affinity matrix
+ *
+ *  branchlabels show the number of common genes in the subtrees
+ *  branchlengths represent the number of new common genes since the last node
+ *
+ *  @author Marcus Lechner, Lydia Steiner
+ *  @email marcus at bioinf.uni-leipzig.de
+ *  @company Bioinformatics, University of Leipzig
+ *  @version 1.10 (for PO5)
+ *  @date 2016-02-22
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "basic-types.h"
+
+BOOL** fetch_matrix(Uint* ccs, Uint* viecher, char* filename, Uint size, char*** pnamen);
+void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen);
+void merge (Uint* max, Uint css, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen);
+BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char*** pnamen);
+int sim(BOOL** matrix, Uint a, Uint b, Uint ccs);
+char* myitoa(int zahl, int base);
+int get_number(char* word);
+
+int main(int argc, char** argv) {
+	/* print error if no filename given */
+	if (argc != 2) {
+		fprintf(stderr,"Error: no filename given.\nUseage: tbc FILENAME\n\n");		
+		return 1;
+	} 
+
+	/* init variables */
+	/* No. of ccs */
+	Uint ccs;									
+	/* No. of species */
+	Uint viecher;	
+	/* "string"-array of names, needs to be a pointer for it gets reallocaded by fetch_matrix */	
+	char*** pnamen = (char***)malloc(sizeof(char**));							
+	/* Just a counter */							
+	Uint i;									
+	
+	fprintf(stderr," >Loading data...");
+	/* read data from file */	
+	BOOL** matrix = fetch_matrix(&ccs, &viecher, argv[1], 10000, pnamen);
+	fprintf(stderr,"done\n");
+
+	fprintf(stderr," >Generating tree...");
+	/* Main part: generate the tree from matrix */	
+	builder(matrix,viecher,ccs,pnamen);
+	fprintf(stderr,"done\n");
+
+
+	/* substitute every "_" to a whitespace in output tree */	
+	i = 0;
+	while ((*pnamen)[0][i] != '\0') {
+		if ((*pnamen)[0][i] == '_') {(*pnamen)[0][i] = ' ';}
+		i++;
+	}
+	/* and print it to stdout */	
+	printf("%s;\n",(*pnamen)[0]);
+
+	/* free memory */
+	for(i = 0; i < viecher; i++){free(matrix[i]);}
+	free(matrix);
+//	for(i = 0; i < viecher; i++){free((*pnamen)[i]);}
+	/* all other entries where already freed */
+	free((*pnamen)[0]);
+	free(*pnamen);free(pnamen);
+
+	return 0;
+}
+
+/*  
+ *  generates the tree
+ *  matrix 	- ccs as adjacent matrix (0/1) 
+ *  viecher	- number of species
+ *  ccs	- number of ccs
+ *  pnamen	- species names ordered the same way as matrix
+ *  returns: nothing, but pnamen[0] contains the tree afterwards
+ */
+void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen) {
+	/* affinity matrix */
+	int** aff = (int**)malloc(sizeof(int*)*viecher);
+	int i;
+	/* allocate further memory for affinity matrix */
+	for (i = 0; i<viecher; i++) {
+		aff[i] = (int*)malloc(sizeof(int)*viecher);
+		/* initiate every entry with -1 */
+		memset(aff[i], -1, sizeof(int)*viecher);
+	}
+	
+	/* array to remember merged species */
+	BOOL away[viecher];
+	/* initiate every entry with 0 */
+	memset(away, 0, sizeof(BOOL)*viecher);
+
+	/* build the tree as long as there are at least 2 species */
+	while (getmax(matrix, away, viecher, ccs, aff, pnamen));
+
+	/* pnamen[0] contains the tree now */
+
+	/* free affinity matrix */
+	for (i = 0; i<viecher; i++) {
+		free(aff[i]);
+	}
+	free(aff);
+}
+
+/*  
+ *  merges two species in adjacent matrix
+ *  only the common features (= 1 enties) stay, rest becomes 0
+ *  loss of one species is marked in away-array
+ *  need for recalculation of affinity is marked in aff with -1
+ *  besides pnamen for the tree is bulided as merged
+ *  max	- contains to entries which should be merged (the one, with the maximum similarity)
+ *  ccs	- number of ccs
+ *  viecher	- number of species
+ *  away	- vector which tells which entry of matrix has been removed (= marked as removed)
+ *  matrix 	- ccs as adjacent matrix (0/1)
+ *  aff	- affinity matrix
+ *  pnamen	- species names ordered the same way as matrix (including merged steps)
+ *  returns:  nothing, but all involved data-structures represent the merge afterwards correctly
+ */
+void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen) {
+	int i, counter = 0;	
+
+	/* only keep components that are present in both species */
+	/* foreach component */	
+	for (i = 0; i < ccs; i++) {
+		/* if species 1 has it */
+		if (matrix[max[0]][i] == '1') {
+			/* give it the value of species 2, */
+			matrix[max[0]][i] = matrix[max[1]][i];
+		}
+		/* now there is only a one, if both had it */
+	}
+
+	/* counter = affinity of the merged species */
+	counter = aff[max[0]][max[1]];
+
+	/* mark the species as away, the entry will not be used again */
+	away[max[1]] = 1;
+	/* mark every entry within the affinity matrix as 'recaculate me' */
+	for (i = 0; i < viecher; i++) {
+		/* only the upper triangle matrix is used */
+		aff[max[0]][i] = -1;
+		/* vice versa not needed */
+	}
+
+	/* fetch the number of proteins the specie's subtrees have in common */
+	int gemein_0 = get_number((*pnamen)[max[0]]);
+	int gemein_1 = get_number((*pnamen)[max[1]]);
+
+	/* transform the number to text 
+	   the difference with their affinity is the new branch length (= distance) */
+	char* length_0 = myitoa(gemein_0-counter,10);
+	char* length_1 = myitoa(gemein_1-counter,10);
+
+	/* transform the number of common genes (= affinity) to text */
+	char* anz = myitoa(counter,10);
+	/* calculate how many place is needed for the new string 
+         sum of both species strings (= subtree) + new numbers + parenteses + : + \0 */
+	int newsize = strlen((*pnamen)[max[0]])+strlen((*pnamen)[max[1]])+strlen(anz)+strlen(length_0)+strlen(length_1)+8;
+	/* and malloc */
+	char* baum = (char*)malloc(sizeof(char)*newsize);
+	/* make it an empty string, so strcat can handle it right */
+	baum[0] = '\0';
+
+	/* bulid the new string (= merged subtrees) */
+	baum = strcat(baum,"(");
+	baum = strcat(baum,(*pnamen)[max[0]]);
+	baum = strcat(baum,":");
+	baum = strcat(baum,length_0);
+	baum = strcat(baum,",");
+	baum = strcat(baum,(*pnamen)[max[1]]);
+	baum = strcat(baum,":");
+	baum = strcat(baum,length_1);
+/	baum = strcat(baum,")");
+//	baum = strcat(baum,")[");
+//	baum = strcat(baum,anz);
+//	baum = strcat(baum,"]");
+
+	/* free the space both merged subtrees */	
+	free((*pnamen)[max[0]]);
+	free((*pnamen)[max[1]]);
+	/* store the merged tree in place of the first subtree */	
+	(*pnamen)[max[0]] = baum;
+
+	/* free */
+	free(length_0);
+	free(length_1);
+	/* pointer gotten from myitoa */
+	free(anz);
+
+}
+
+
+/*  
+ *  finds the two species that have the highest similarity and merges them
+ *  matrix 	- ccs as adjacent matrix (0/1)
+ *  away	- vector which tells which entry of matrix has been removed (= marked as removed)
+ *  viecher	- number of species
+ *  ccs	- number of ccs
+ *  aff	- affinity matrix
+ *  pnamen	- species names ordered the same way as matrix (including merged steps)
+ *  returns:  1 if there was a pair, 0 if there is no species/cluster left to comare (= finished)
+ *            pnamen[0] contains the complete tree afterwards
+ */
+BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char*** pnamen) {
+	Uint i, j;
+	int k, max = -1;
+	
+	/* Storage for the best pair */
+	Uint bestpair[2];
+	bestpair[0] = 0;
+	bestpair[1] = 0;
+
+	/* foreach species 1 (line) */
+	for (i = 0; i < viecher-1; i++) {
+		/* skip the linies with away entries */
+		if (away[i]) {continue;}
+
+		/* foreach species 2 (row) */
+		for (j = i+1; j < viecher; j++) {
+			/* skip the rows with away entries */
+			if (away[j]) {continue;}
+	
+			/* recalculate similarity if needed */
+			if (aff[i][j] == -1) {aff[i][j] = sim(matrix,i,j,ccs);}
+	
+			/* if similarity is bigger than best until now */
+			k = aff[i][j];
+			if (max < k) {
+				/* store it and the corresponding indices */
+				max = k;
+				bestpair[0] = i;
+				bestpair[1] = j;
+			}
+		}
+	} 
+	
+	/* if both entries are the same (as initiated), we are done */
+	if (bestpair[0] == bestpair[1]) {
+		/* get the last branch length of the tree */
+		char* anz = myitoa(get_number((*pnamen)[0]),10);
+		/* recalculate size of the tree */
+		int newsize = strlen((*pnamen)[0])+strlen(anz)+2;
+		/* realloc */
+		(*pnamen)[0] = (char*)realloc((*pnamen)[0],sizeof(char)*newsize);
+		/* add the information about the root's branchlength */
+		(*pnamen)[0] = strcat((*pnamen)[0],":");
+		(*pnamen)[0] = strcat((*pnamen)[0],anz);
+		/* free myitoa's string */
+		free(anz);
+		/* return finished */
+		return 0;
+	}
+
+	/* else, merge the pairs */
+	merge(bestpair,ccs,viecher,away,matrix,aff, pnamen);
+
+	/* return done */
+	return 1;
+}
+
+/*
+ *  calculates the similarity between two speices
+ *  matrix 	- ccs as adjacent matrix (0/1)
+ *  a		- species 1
+ *  b		- species 2
+ *  ccs	- number of ccs
+ *  return	- their simialarity (= number of common proteins)
+ */
+int sim(BOOL** matrix, Uint a, Uint b, Uint ccs) {
+	Uint i;
+	int counter = 0;
+	/* foreach connected component */
+	for (i = 0; i < ccs; i++) {
+		/* if both species have an entry, count */
+		if (matrix[a][i] == '1' && matrix[b][i] == '1') counter++;
+	}
+	/* return the sum of common entries */
+	return counter;
+}
+
+/*
+ *  fetches the matrix from a file
+ *  ccs	- number of ccs (will be defined here)
+ *  viecher	- number of species (will be defined here)
+ *  filename- file to fetch the data from
+ *  size	- initialy allocated size for the the matrix
+ *  pnamen	- species names ordered the same way as in the returned matrix  (will be defined here)
+ *  return	- pointer to the matrix
+ *
+ *  file format:
+ *  no. of species
+ *  names of species, tab seperated
+ *  linewise connected component entries with tab seperated 1 or 0
+ */
+BOOL** fetch_matrix(Uint* ccs, Uint* viecher, char* filename, Uint size, char*** pnamen) {
+	Uint i;
+
+	/* open 0/1 ccs matrix */
+	FILE* datei = fopen (filename, "r");
+	/* read first line = no. of species */
+	fscanf (datei, "%d\n", viecher);
+	/* allocate memory for species names */
+	*pnamen = (char**) malloc(sizeof(char*)*(*viecher));
+	/* read species names and genecount from 2nd line of file */
+	for (i = 0; i < (*viecher); i++) {
+		/* allocate memory for name */
+		/* Attention: dont use strings longer than 255 characters! */
+		(*pnamen)[i] = (char*) malloc(sizeof(char)*255);
+		/* load name in allocated memory */		
+		if (i < (*viecher)-1) {
+			/* normal case: */		
+			fscanf (datei, "%s ",(*pnamen)[i]);
+		}
+		else {	
+			/* special case: last species name is followed by newline */
+			fscanf (datei, "%s \n",(*pnamen)[i]);
+		}		
+		/* resize memory to what is needed */
+		(*pnamen)[i] = (char*) realloc((*pnamen)[i],sizeof(char)*(strlen((*pnamen)[i])+1));
+	}
+	
+	/* number of connected components is 0 at the beginnig */
+	*ccs = 0;
+	/* allocate memory for 0/1 ccs matrix */
+	BOOL** matrix = (BOOL**) malloc(sizeof(BOOL*)*(*viecher));
+
+	/* Rowwise alloc */
+	for (i = 0; i < (*viecher); i++) {
+		matrix[i] = (BOOL*) malloc(sizeof(BOOL)*size);
+	}
+
+	/* will contain the first char read from line */
+	BOOL first;
+	
+	/* read ccs from file, line by line, start with the first char of each line */
+	while(fscanf(datei,"%c ",&first) != EOF){
+		/* if more ccs than allocated memory -> allocate more memory */
+		if(*ccs >= size)	{
+			size += 10000;
+			for (i = 0; i < (*viecher); i++) {
+				matrix[i] = (BOOL*) realloc(matrix[i],sizeof(BOOL)*size); // BOOL*
+			}
+		}
+		/* set first number to what was found first */		
+		matrix[0][*ccs] = first;
+		/* and gather the rest one by one from file */		
+		for (i = 1; i < (*viecher)-1; i++) {
+			/* store char in row */					
+			fscanf (datei, "%c ",&(matrix[i][*ccs]));
+		}
+		/* special case: last char is followed by newline */					
+		fscanf (datei, "%c \n",&(matrix[(*viecher)-1][*ccs]));
+		/* we have a new cc in memory */
+		(*ccs)++;
+	}
+	/* file read throug, close it */
+	fclose (datei);
+	
+	/* resize memory to what is needed for size is likely to be bigger than ccs */
+	for (i = 0; i < (*viecher); i++) {
+		matrix[i] = (BOOL*) realloc(matrix[i],sizeof(BOOL)*(*ccs));
+	}
+	
+	/* return the matrix pointer */
+	return matrix;
+}
+
+/*
+ *  finds the last [number] entry within a string and converts it to an int
+ *  word	- the string to scan
+ *  return	- number within the last [number] pattern
+ */
+int get_number(char* word) {
+	int i, start = -1, ende = -1;	
+	/* find start and end of the number */
+	for (i = strlen(word); i >= 0; i--) {
+		if (word[i] == ']') ende = i;
+		if (word[i] == '[') {
+			start = i+1;
+			break;
+		}
+	}
+	/* allocate memory for the number alone */
+	char zahl[ende-start+1];
+	/* copy the numbe to the new string */
+	for (i = 0; i < ende-start; i++) {
+		zahl[i] = word[start+i];
+	}
+	/* add the char finish sign */
+	zahl[ende-start] = '\0';
+	/* return the number as int */	
+	return atoi(zahl);
+}
+
+/* 
+ *  some compilers do not support itoa, so here is something which works for us 
+ *  converts positive numbers to the corresponding text-form as char-array
+ *  Attention: up to 64 signs are supported, base between 1 and 16
+ *  if 'x' is included in the output, sth. went wrong
+ *  zahl	- number to convert (should be postive
+ *  base	- base to witch number will be converted
+ *  returns - char array representing number zahl to base base as text
+ *            Attention: returnvalue needs to be freed separately
+ */
+char* myitoa(int zahl, int base) {
+	int i = 0, k;
+	char reverse[64];
+
+	/* Zero is easy, also negative numbers are cought here */
+	if(zahl == 0){
+		char* zero = (char*) malloc(sizeof(char)*2);
+		zero[0] = '0';
+		zero[1] = '\0';
+		return zero;
+	}
+
+	/* convert to a reverse-"string" by modulu-calculation */
+	while (zahl > 0) {
+		switch(zahl%base) {
+	        case  0:	reverse[i] = '0';break;
+	        case  1:	reverse[i] = '1';break;
+	        case  2:	reverse[i] = '2';break;
+	        case  3:	reverse[i] = '3';break;
+	        case  4:	reverse[i] = '4';break;
+	        case  5:	reverse[i] = '5';break;
+	        case  6:	reverse[i] = '6';break;
+	        case  7:	reverse[i] = '7';break;
+	        case  8:	reverse[i] = '8';break;
+	        case  9:	reverse[i] = '9';break;
+		  case  10:	reverse[i] = 'A';break;
+	        case  11:	reverse[i] = 'B';break;
+	        case  12:	reverse[i] = 'C';break;
+	        case  13:	reverse[i] = 'D';break;
+	        case  14:	reverse[i] = 'E';break;
+	        case  15:	reverse[i] = 'F';break;
+	        case  16:	reverse[i] = 'G';break;
+		  default:  reverse[i] = 'x';
+	       }
+		/* point one position further */		
+		i++;
+		zahl /= base;
+	}
+	/* it's actually one sign less */
+	i--;
+	/* allocate memory for output */
+	char* string = (char*) malloc(sizeof(char)*(i+2));
+	/* reverse to calculated char-array to have the right order of signs */
+	for (k = i; k >= 0; k--) {
+		string[i-k] = reverse[k];
+	}
+	/* it's a string, so finish it with \0 */
+	string[i+1] = '\0';
+	/* and return it */
+	return string;
+}
diff --git a/proteinortho5.pl b/proteinortho5.pl
index 5cf85c1..719e64f 100755
--- a/proteinortho5.pl
+++ b/proteinortho5.pl
@@ -30,7 +30,7 @@
 # @author Marcus Lechner
 # @email lechner at staff.uni-marburg.de
 # @company University of Maruburg
-# @date 2014-07-07
+# @date 2016-02-22
 #
 ##########################################################################################
 
@@ -47,7 +47,7 @@ use Thread::Queue;
 ##########################################################################################
 # Variables
 ##########################################################################################
-our $version = "5.11";
+our $version = "5.12";
 our $step = 0;		# 0/1/2/3	-> do all / only apply step 1 / only apply step 2 / only apply step 3
 our $verbose = 1;	# 0/1		-> don't / be verbose
 our $debug = 0;		# 0/1		-> don't / show debug data
@@ -77,6 +77,7 @@ our $clean = 0;
 our $blastOptions = "";
 our $nograph = 0;
 our $desc = 0;
+our $tmp_path = "";
 
 # Internal
 our $blastversion = "unknown";	# Auto-detected blast version
@@ -105,6 +106,10 @@ foreach my $option (@ARGV) {
   if ($option =~ m/^--?step=(0|1|2|3)$/)  		{ $step = $1;   }
   elsif ($option =~ m/^--?verbose$/) 			{ $verbose = 1;  }
   elsif ($option =~ m/^--?verbose=(0|1)$/) 		{ $verbose = $1;  }
+  elsif ($option =~ m/^--?te?mp=(.+)$/)	 		{ $tmp_path = $1; 
+								 # make sure it ends with /
+								unless ($tmp_path =~ /\/$/) {$tmp_path .= "/";}
+							}
   elsif ($option =~ m/^--?debug$/)     			{ $debug = 1;  }
   elsif ($option =~ m/^--?debug=(0|1)$/)     		{ $debug = $1;  }
   elsif ($option =~ m/^--?p=(.*)$/)     		{ $blastmode = $1; }
@@ -158,8 +163,8 @@ if ($startat != 0 || $stopat != -1) {
 our $simgraph = "$project.blast-graph$run_id";		# Output file graph
 our $syngraph = "$project.ffadj-graph$run_id";		# Output file synteny
 
-our $rm_simgraph = "$project.removed_blast-graph";		# Output remove graph
-our $rm_syngraph = "$project.removed_ffadj-graph";		# Output remove graph
+our $rm_simgraph = "$tmp_path$project.removed_blast-graph";		# Output remove graph
+our $rm_syngraph = "$tmp_path$project.removed_ffadj-graph";		# Output remove graph
 
 our $csimgraph = "$project.proteinortho-graph";		# Output file graph
 our $csyngraph = "$project.poff-graph";				# Output file synteny
@@ -280,6 +285,7 @@ Options: -e=          E-value for blast [default: 1e-05]
          -p=          blast program {blastn|blastp|blastn+|blastp+}
                       [default: blastp+]
          -project=    prefix for all result file names [default: myproject]
+         -temp=       path for temporary files [default: working directory]
          -synteny     activate PoFF extension to separate similar sequences
                       by contextual adjacencies (requires .gff for each .fasta)
          -dups=       PoFF: number of reiterations for adjacencies heuristic,
@@ -390,7 +396,7 @@ sub run_blast {
 
 sub workerthread {
 	my $thread_id = threads->tid();
-	my $temp_file = "$project-$run_id-$thread_id";
+	my $temp_file = "$tmp_path$project-$run_id-$thread_id";
 
 	# Clean up, just to be safe
 	unlink("$temp_file.tmp");
@@ -462,7 +468,7 @@ sub workerthread {
 			open(PREGRAPH,">>$temp_file.tmp") || die("Could not open temp file '$temp_file.tmp': $!");
 			print PREGRAPH $ordered_matches;
 			close(PREGRAPH);
-			my $cmd = "$po_path/ffadj_mcs.py $temp_file.tmp $alpha";
+			my $cmd = "$po_path/ffadj_mcs.py '$temp_file.tmp' $alpha";
 			if ($duplication) {
 				$cmd .= " --repeat-matching $duplication --min-cs-size $cs";
 			}
@@ -863,7 +869,7 @@ sub blast {
 	my $b = $_[1];
 	$a =~ s/^.*\///;
 	$b =~ s/^.*\///;
-	my $bla = "$a.vs.$b.bla";
+	my $bla = "$tmp_path$a.vs.$b.bla";
 
 	# File does not exists yet or I am forced to rewrite it
 	if (!(-s $bla) || $force) {
diff --git a/proteinortho5_clustering.cpp b/proteinortho5_clustering.cpp
index 1691eb4..a7bbba1 100755
--- a/proteinortho5_clustering.cpp
+++ b/proteinortho5_clustering.cpp
@@ -33,6 +33,7 @@ void clear_edges(vector<unsigned int>&);
 void partition_graph(void);
 void print_group(vector<unsigned int>& , float );
 void print_header(void);
+vector<unsigned int> get_deg_one (vector<unsigned int>& );
 
 // Parameters
 bool param_verbose 		= false;
@@ -285,6 +286,16 @@ unsigned int max_deg(vector<unsigned int>& nodes) {
 	return max;
 }
 
+// Return nodes with degree 1 of given protein_ids
+vector<unsigned int> get_deg_one (vector<unsigned int>& nodes) {
+	vector<unsigned int> one;
+	for (unsigned int i = 0; i < nodes.size(); i++) {
+		unsigned int degree = graph[nodes[i]].edges.size();
+		if (degree == 1) {one.push_back(nodes[i]);}
+	}
+	return one;
+}
+
 // Generate random vector x of size size
 vector<float> generate_random_vector(const unsigned int size) {
 	vector<float> x(size);
@@ -378,6 +389,17 @@ void removeExternalEdges(map<unsigned int,bool>& a) {
 
 // Split connected component according to eigenvector
 void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
+	// Remove tree like structures in the beginning
+	vector<unsigned int> one = get_deg_one(nodes);
+	if (one.size() > 0) {
+		map<unsigned int,bool> tree;
+		for (unsigned int i = 0; i < one.size(); i++) {
+			{tree[one[i]] = true;}
+		}
+		removeExternalEdges(tree);
+		return;
+	}
+
 	// Store data about two groups
 	map<unsigned int,bool> groupA, groupB;
 	for (unsigned int i = 0; i < y.size(); i++) {
@@ -385,6 +407,12 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
 		if (y[i] < 0) 		{groupA[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#95cde5}" << endl; }
 		else              	{groupB[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#b01700}" << endl; }
 	}
+
+	// Catch error in laplacien calcs
+	if (groupA.size() == 0 || groupB.size() == 0) {
+		throw "Failed to partition subgraph! This might lead to an infinit loop. Please submit the .blastgraph file to lechner at staff.uni-marburg.de to help fixing this issue.";
+	}
+
 	removeExternalEdges(groupA);
 	removeExternalEdges(groupB);
 }
@@ -406,7 +434,8 @@ float getConnectivity(vector<unsigned int>& nodes) {
 	vector<float> norm = nomalize(x_hat, &last_length);
 
 	// Repeat until difference < epsilon
-	while(1) {
+	unsigned int iter = 1000;							// max 1000 iterations (catch huge clustering issues)
+	while(iter-- != 0) { 
 		last_length = current_length;
 		// Get a new x
 		x = get_new_x(norm, nodes, mapping);
@@ -417,8 +446,11 @@ float getConnectivity(vector<unsigned int>& nodes) {
 		// Get lenght (lambda) & normalize vector
 		norm = nomalize(x_hat, &current_length);
 //		cerr << "IT: " << current_length << endl;
-		if (abs(current_length-last_length) < 0.0001) break;
+		if (abs(current_length-last_length) < 0.000333 && iter >= 10) break;	// min 10 iterations (prevent convergence by chance)
 	}
+
+	//	cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
+
 	float connectivity = (-current_length+2*max_degree)/(nodes.size());
 
 	// Split groups if connectivity is too low, remove tree like structures that might have arosen
diff --git a/proteinortho5_tree b/proteinortho5_tree
new file mode 100755
index 0000000..96cbbe4
Binary files /dev/null and b/proteinortho5_tree differ

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/proteinortho.git



More information about the debian-med-commit mailing list