[med-svn] [proteinortho] 02/07: Imported Upstream version 5.12+dfsg
Andreas Tille
tille at debian.org
Thu Mar 10 19:55:28 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository proteinortho.
commit d8a5c6c74466dee2d33e858dd7e78451d499dafc
Author: Andreas Tille <tille at debian.org>
Date: Thu Mar 10 20:28:30 2016 +0100
Imported Upstream version 5.12+dfsg
---
Makefile | 7 +-
basic-types.h | 58 ++++++
ffadj_mcs.py | 6 +-
manual.html | 3 +-
po2tree.pl | 161 ++++++++++++++
po_tree | Bin 0 -> 17650 bytes
po_tree.c | 484 +++++++++++++++++++++++++++++++++++++++++++
proteinortho5.pl | 20 +-
proteinortho5_clustering.cpp | 36 +++-
proteinortho5_tree | Bin 0 -> 72584 bytes
10 files changed, 761 insertions(+), 14 deletions(-)
diff --git a/Makefile b/Makefile
index bb54a9d..3c8b861 100644
--- a/Makefile
+++ b/Makefile
@@ -4,15 +4,18 @@ INSTALLDIR=/usr/local/bin
CPP = g++
CPPFLAGS = -Wall -O3
-all: proteinortho5_clustering proteinortho5_clean_edges
+all: proteinortho5_clustering proteinortho5_clean_edges proteinortho5_tree
proteinortho5_clustering: proteinortho5_clustering.cpp
$(CPP) $(CPPFLAGS) -o $@ $<
+proteinortho5_tree: proteinortho5_clustering.cpp
+ $(CPP) $(CPPFLAGS) -o $@ $<
+
proteinortho5_clean_edges: proteinortho5_clean_edges.cpp
$(CPP) $(CPPFLAGS) -o $@ $<
-install: proteinortho5.pl proteinortho5_clustering proteinortho5_clean_edges2.pl proteinortho5_singletons.pl ffadj_mcs.py
+install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po_tree po_tree.pl
install -v $^ $(INSTALLDIR)
test: proteinortho5.pl proteinortho5_clustering
diff --git a/basic-types.h b/basic-types.h
new file mode 100644
index 0000000..5aa1dcd
--- /dev/null
+++ b/basic-types.h
@@ -0,0 +1,58 @@
+#ifndef BASIC_TYPES_H
+#define BASIC_TYPES_H
+
+
+#ifdef __CYGWIN__
+
+#define CRLF '\r'
+
+#else
+#define CRLF ' '
+#endif
+
+#define MAXBUFFERSIZE 10000
+#define BASEINC 10000
+typedef unsigned char Uchar;
+typedef unsigned int Uint;
+typedef signed int Sint;
+typedef unsigned char BOOL;
+
+#define True '1'
+#define False '0'
+
+typedef struct {
+ int a,
+ b;
+} PairSint;
+
+
+typedef struct {
+ Uint a,
+ b;
+} PairUint;
+
+
+typedef struct {
+ Uint a,
+ b,
+ c;
+} TripleUint;
+
+
+typedef struct {
+ int a,
+ b,
+ c;
+} TripleSint;
+
+typedef struct {
+ int a,
+ b,
+ c,
+ d;
+} QuadSint;
+
+
+
+#endif
+
diff --git a/ffadj_mcs.py b/ffadj_mcs.py
index 4967ffd..485af76 100755
--- a/ffadj_mcs.py
+++ b/ffadj_mcs.py
@@ -842,7 +842,8 @@ if __name__ == '__main__':
alpha = float(argv[2])
edgeThreshold = len(argv) == 4 and float(argv[3]) or 0
- logFileName = '%s.log' %(basename(argv[1]).rsplit('.')[0])
+# logFileName = '%s.log' %(basename(argv[1]).rsplit('.')[0])
+ logFileName = '%s.log' %(argv[1].rsplit('.')[0]) # Respect given path
logging.basicConfig(filename=logFileName,filemode='w', level=logging.INFO,
format= "%(levelname)s\t%(asctime)s\t++ %(message)s")
@@ -876,7 +877,8 @@ if __name__ == '__main__':
# print matching
#
- out_file = basename(argv[1])
+# out_file = basename(argv[1])
+ out_file = argv[1] # respect given path
f = open('%s.matching' %out_file[:out_file.rfind('.')], 'w')
printMatching(g1_mod, g2_mod, g1_runs, hasMultipleChromosomes, f)
f.flush()
diff --git a/manual.html b/manual.html
index 0227a67..fac0228 100644
--- a/manual.html
+++ b/manual.html
@@ -4,7 +4,7 @@
</head>
<body>
<h1>Proteinortho Manual / PoFF Manual</h1>
-<small>This manual corresponds to version 5.11</small>
+<small>This manual corresponds to version 5.12</small>
<h2>Introduction</h2>
Proteinortho is a tool to detect orthologous genes within different species.
For doing so, it compares similarities of given gene sequences and clusters them to find significant groups.
@@ -175,6 +175,7 @@ They are:
</ul></td></tr>
<tr><td>-project=</td><td>myproject</td><td> prefix for all result file names</td></tr>
+<tr><td>-temp=</td><td>working directory</td><td> path for temporary files</td></tr>
<tr><td>-synteny</td><td>-</td><td>activate PoFF extension to separate similar sequences using synteny (requires a GFF file for each FASTA file)</td></tr>
<tr><td>-nograph</td><td>-</td><td>do not generate .graph files with pairwise orthology data<br>saves some time</td></tr>
<tr><td>-keep</td><td>-</td><td>store temporary blast results for reuse</td></tr>
diff --git a/po2tree.pl b/po2tree.pl
new file mode 100755
index 0000000..77d5435
--- /dev/null
+++ b/po2tree.pl
@@ -0,0 +1,161 @@
+#!/usr/bin/perl
+
+##########################################################################################
+# This file is part of proteinortho.
+# (C) 2009 Marcus Lechner
+#
+# proteinortho is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 2, or (at your
+# option) any later version.
+#
+# proteinortho is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with proteinortho; see the file COPYING. If not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+##########################################################################################
+
+##########################################################################################
+# About
+##########################################################################################
+# orthomatrix2tree
+# input ortheset output from proteinortho
+# output corresponding UPGMA tree in newick format
+#
+# branch labels show the number of common genes in the subtrees
+# branch lengths represent the number of new common genes since the last node
+#
+# @author Marcus Lechner
+# @email lechner at staff.uni-marburg.de
+# @company Bioinformatics, University of Leipzig
+# @version 3.10
+# @date 08-12-2015
+#
+##########################################################################################
+
+##########################################################################################
+# Imports
+##########################################################################################
+use strict;
+use warnings "all";
+use File::Basename;
+
+
+##########################################################################################
+# Usage, defaults and parameters
+##########################################################################################
+my $usage= << "JUS";
+ usage: po2tree.pl [OPTIONS] ORTHOMATRIX >OUTTREE
+ input: output from Proteinortho5
+ output: corresponding UPGMA tree in newick format
+ options: -o=[FILE] prints output to the given file rather than STDOUT
+JUS
+
+# path to the C-part of this program
+my @tmppath = fileparse($0);
+our $scriptpath = $tmppath[1];
+
+# Output switch
+my $o = "";
+my $path = "";
+my $file = "";
+
+# No parameters
+unless (defined($ARGV[0])) {
+ print "Error: No input file defined!\n\n$usage";
+ exit 1;
+}
+
+foreach (@ARGV) {
+ if ($_ =~ /^--?o.*=(.+)/) {$o = $1;} # output file
+ elsif ($_ =~ /^-/) { print STDERR "Unknown parameter $_\n$usage"; exit 1;}
+ elsif (!-e $_) { print STDERR "Could not find file $_\n"; exit 1;}
+ else {$file = $_;}
+}
+
+# open the given file, this should be a proteinortho matrix
+open(SOURCE,"<$file") || die("Could not open file $file\n$!");
+# write temp file
+open(TRG,">$file.tmp.matrix2") || die("Could not open tmp-file $file.tmp.matrix2, $!");
+my $did = 0;
+my @viecher = ();
+
+my @present_in_groups;
+# convert the orthoset to a 1/0 matrix
+print STDERR "Preparing datafile...\n";
+while(<SOURCE>) {
+ my @data = split;
+ # first line
+ if ($data[0] =~ /^\#/) {
+ # remove the first elements
+ shift(@data);shift(@data);shift(@data);
+
+ # first line
+ if ($did == 0) {
+ shift(@data);
+ # species names to header
+ @viecher = @data;
+ }
+
+ }
+ else {
+ shift(@data);shift(@data);shift(@data);
+ # any futher line
+ my $i = 0;
+ foreach (@data) {
+ # just transform the entries to 0 and 1
+ # the gene-names are not important here
+ if ($_ eq "*") {
+ print TRG 0;
+ }
+ else {
+ # Count number of copies
+ my $num = 1;
+# $num += $_ =~ /,/g; # count number of copies (not yet respected in C)
+ print TRG $num;
+ $present_in_groups[$i]++; # count group
+ }
+ print TRG " ";
+ $i++;
+ }
+ print TRG "\n";
+ }
+}
+close(SOURCE);
+close(TRG);
+
+open(TMP,">$file.tmp.matrix") || die("Could not open tmp-file $file.tmp.matrix, $!");
+ for (my $i = 0; $i < scalar(@viecher); $i++) {
+ $viecher[$i] =~ s/\..*?$//; # remove point
+ $viecher[$i] =~ s/\s/_/g; # alphabet
+ $viecher[$i] .= "_[$present_in_groups[$i]]"; # add number of groups
+ }
+ print TMP scalar(@viecher)."\n";
+ print TMP join(" ", at viecher)."\n";
+close(TMP);
+system("cat '$file.tmp.matrix2' >>'$file.tmp.matrix'");
+
+# Run the main algorithm in C
+print STDERR "Calculating tree...\n";
+my $run = $scriptpath."po_tree";
+my $out = qx($run $ARGV[0].tmp.matrix);
+$out =~ s/\[.+?\]//g;
+$out =~ s/\s:/:/g;
+
+if ($o ne "") {
+ open(OUT,">$o") || die("Could not write output file $!");
+ print OUT $out;
+ close(OUT);
+}
+else {
+ print $out;
+}
+
+# unlink temporary files
+unlink("$file.tmp.matrix");
+unlink("$file.tmp.matrix2");
diff --git a/po_tree b/po_tree
new file mode 100755
index 0000000..09ca5a2
Binary files /dev/null and b/po_tree differ
diff --git a/po_tree.c b/po_tree.c
new file mode 100644
index 0000000..8b6604d
--- /dev/null
+++ b/po_tree.c
@@ -0,0 +1,484 @@
+/*
+ This file is part of proteinortho.
+ (C) 2009 Marcus Lechner
+
+ proteinortho is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ proteinortho is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with proteinortho; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+*/
+
+/*
+ * TreebuilderC - core part of proteinorthos treebulider
+ * generates a newicktree from proteinortho's affinity matrix
+ *
+ * branchlabels show the number of common genes in the subtrees
+ * branchlengths represent the number of new common genes since the last node
+ *
+ * @author Marcus Lechner, Lydia Steiner
+ * @email marcus at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @version 1.10 (for PO5)
+ * @date 2016-02-22
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "basic-types.h"
+
+BOOL** fetch_matrix(Uint* ccs, Uint* viecher, char* filename, Uint size, char*** pnamen);
+void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen);
+void merge (Uint* max, Uint css, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen);
+BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char*** pnamen);
+int sim(BOOL** matrix, Uint a, Uint b, Uint ccs);
+char* myitoa(int zahl, int base);
+int get_number(char* word);
+
+int main(int argc, char** argv) {
+ /* print error if no filename given */
+ if (argc != 2) {
+ fprintf(stderr,"Error: no filename given.\nUseage: tbc FILENAME\n\n");
+ return 1;
+ }
+
+ /* init variables */
+ /* No. of ccs */
+ Uint ccs;
+ /* No. of species */
+ Uint viecher;
+ /* "string"-array of names, needs to be a pointer for it gets reallocaded by fetch_matrix */
+ char*** pnamen = (char***)malloc(sizeof(char**));
+ /* Just a counter */
+ Uint i;
+
+ fprintf(stderr," >Loading data...");
+ /* read data from file */
+ BOOL** matrix = fetch_matrix(&ccs, &viecher, argv[1], 10000, pnamen);
+ fprintf(stderr,"done\n");
+
+ fprintf(stderr," >Generating tree...");
+ /* Main part: generate the tree from matrix */
+ builder(matrix,viecher,ccs,pnamen);
+ fprintf(stderr,"done\n");
+
+
+ /* substitute every "_" to a whitespace in output tree */
+ i = 0;
+ while ((*pnamen)[0][i] != '\0') {
+ if ((*pnamen)[0][i] == '_') {(*pnamen)[0][i] = ' ';}
+ i++;
+ }
+ /* and print it to stdout */
+ printf("%s;\n",(*pnamen)[0]);
+
+ /* free memory */
+ for(i = 0; i < viecher; i++){free(matrix[i]);}
+ free(matrix);
+// for(i = 0; i < viecher; i++){free((*pnamen)[i]);}
+ /* all other entries where already freed */
+ free((*pnamen)[0]);
+ free(*pnamen);free(pnamen);
+
+ return 0;
+}
+
+/*
+ * generates the tree
+ * matrix - ccs as adjacent matrix (0/1)
+ * viecher - number of species
+ * ccs - number of ccs
+ * pnamen - species names ordered the same way as matrix
+ * returns: nothing, but pnamen[0] contains the tree afterwards
+ */
+void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen) {
+ /* affinity matrix */
+ int** aff = (int**)malloc(sizeof(int*)*viecher);
+ int i;
+ /* allocate further memory for affinity matrix */
+ for (i = 0; i<viecher; i++) {
+ aff[i] = (int*)malloc(sizeof(int)*viecher);
+ /* initiate every entry with -1 */
+ memset(aff[i], -1, sizeof(int)*viecher);
+ }
+
+ /* array to remember merged species */
+ BOOL away[viecher];
+ /* initiate every entry with 0 */
+ memset(away, 0, sizeof(BOOL)*viecher);
+
+ /* build the tree as long as there are at least 2 species */
+ while (getmax(matrix, away, viecher, ccs, aff, pnamen));
+
+ /* pnamen[0] contains the tree now */
+
+ /* free affinity matrix */
+ for (i = 0; i<viecher; i++) {
+ free(aff[i]);
+ }
+ free(aff);
+}
+
+/*
+ * merges two species in adjacent matrix
+ * only the common features (= 1 enties) stay, rest becomes 0
+ * loss of one species is marked in away-array
+ * need for recalculation of affinity is marked in aff with -1
+ * besides pnamen for the tree is bulided as merged
+ * max - contains to entries which should be merged (the one, with the maximum similarity)
+ * ccs - number of ccs
+ * viecher - number of species
+ * away - vector which tells which entry of matrix has been removed (= marked as removed)
+ * matrix - ccs as adjacent matrix (0/1)
+ * aff - affinity matrix
+ * pnamen - species names ordered the same way as matrix (including merged steps)
+ * returns: nothing, but all involved data-structures represent the merge afterwards correctly
+ */
+void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen) {
+ int i, counter = 0;
+
+ /* only keep components that are present in both species */
+ /* foreach component */
+ for (i = 0; i < ccs; i++) {
+ /* if species 1 has it */
+ if (matrix[max[0]][i] == '1') {
+ /* give it the value of species 2, */
+ matrix[max[0]][i] = matrix[max[1]][i];
+ }
+ /* now there is only a one, if both had it */
+ }
+
+ /* counter = affinity of the merged species */
+ counter = aff[max[0]][max[1]];
+
+ /* mark the species as away, the entry will not be used again */
+ away[max[1]] = 1;
+ /* mark every entry within the affinity matrix as 'recaculate me' */
+ for (i = 0; i < viecher; i++) {
+ /* only the upper triangle matrix is used */
+ aff[max[0]][i] = -1;
+ /* vice versa not needed */
+ }
+
+ /* fetch the number of proteins the specie's subtrees have in common */
+ int gemein_0 = get_number((*pnamen)[max[0]]);
+ int gemein_1 = get_number((*pnamen)[max[1]]);
+
+ /* transform the number to text
+ the difference with their affinity is the new branch length (= distance) */
+ char* length_0 = myitoa(gemein_0-counter,10);
+ char* length_1 = myitoa(gemein_1-counter,10);
+
+ /* transform the number of common genes (= affinity) to text */
+ char* anz = myitoa(counter,10);
+ /* calculate how many place is needed for the new string
+ sum of both species strings (= subtree) + new numbers + parenteses + : + \0 */
+ int newsize = strlen((*pnamen)[max[0]])+strlen((*pnamen)[max[1]])+strlen(anz)+strlen(length_0)+strlen(length_1)+8;
+ /* and malloc */
+ char* baum = (char*)malloc(sizeof(char)*newsize);
+ /* make it an empty string, so strcat can handle it right */
+ baum[0] = '\0';
+
+ /* bulid the new string (= merged subtrees) */
+ baum = strcat(baum,"(");
+ baum = strcat(baum,(*pnamen)[max[0]]);
+ baum = strcat(baum,":");
+ baum = strcat(baum,length_0);
+ baum = strcat(baum,",");
+ baum = strcat(baum,(*pnamen)[max[1]]);
+ baum = strcat(baum,":");
+ baum = strcat(baum,length_1);
+/ baum = strcat(baum,")");
+// baum = strcat(baum,")[");
+// baum = strcat(baum,anz);
+// baum = strcat(baum,"]");
+
+ /* free the space both merged subtrees */
+ free((*pnamen)[max[0]]);
+ free((*pnamen)[max[1]]);
+ /* store the merged tree in place of the first subtree */
+ (*pnamen)[max[0]] = baum;
+
+ /* free */
+ free(length_0);
+ free(length_1);
+ /* pointer gotten from myitoa */
+ free(anz);
+
+}
+
+
+/*
+ * finds the two species that have the highest similarity and merges them
+ * matrix - ccs as adjacent matrix (0/1)
+ * away - vector which tells which entry of matrix has been removed (= marked as removed)
+ * viecher - number of species
+ * ccs - number of ccs
+ * aff - affinity matrix
+ * pnamen - species names ordered the same way as matrix (including merged steps)
+ * returns: 1 if there was a pair, 0 if there is no species/cluster left to comare (= finished)
+ * pnamen[0] contains the complete tree afterwards
+ */
+BOOL getmax(BOOL** matrix, BOOL* away, Uint viecher, Uint ccs, int** aff, char*** pnamen) {
+ Uint i, j;
+ int k, max = -1;
+
+ /* Storage for the best pair */
+ Uint bestpair[2];
+ bestpair[0] = 0;
+ bestpair[1] = 0;
+
+ /* foreach species 1 (line) */
+ for (i = 0; i < viecher-1; i++) {
+ /* skip the linies with away entries */
+ if (away[i]) {continue;}
+
+ /* foreach species 2 (row) */
+ for (j = i+1; j < viecher; j++) {
+ /* skip the rows with away entries */
+ if (away[j]) {continue;}
+
+ /* recalculate similarity if needed */
+ if (aff[i][j] == -1) {aff[i][j] = sim(matrix,i,j,ccs);}
+
+ /* if similarity is bigger than best until now */
+ k = aff[i][j];
+ if (max < k) {
+ /* store it and the corresponding indices */
+ max = k;
+ bestpair[0] = i;
+ bestpair[1] = j;
+ }
+ }
+ }
+
+ /* if both entries are the same (as initiated), we are done */
+ if (bestpair[0] == bestpair[1]) {
+ /* get the last branch length of the tree */
+ char* anz = myitoa(get_number((*pnamen)[0]),10);
+ /* recalculate size of the tree */
+ int newsize = strlen((*pnamen)[0])+strlen(anz)+2;
+ /* realloc */
+ (*pnamen)[0] = (char*)realloc((*pnamen)[0],sizeof(char)*newsize);
+ /* add the information about the root's branchlength */
+ (*pnamen)[0] = strcat((*pnamen)[0],":");
+ (*pnamen)[0] = strcat((*pnamen)[0],anz);
+ /* free myitoa's string */
+ free(anz);
+ /* return finished */
+ return 0;
+ }
+
+ /* else, merge the pairs */
+ merge(bestpair,ccs,viecher,away,matrix,aff, pnamen);
+
+ /* return done */
+ return 1;
+}
+
+/*
+ * calculates the similarity between two speices
+ * matrix - ccs as adjacent matrix (0/1)
+ * a - species 1
+ * b - species 2
+ * ccs - number of ccs
+ * return - their simialarity (= number of common proteins)
+ */
+int sim(BOOL** matrix, Uint a, Uint b, Uint ccs) {
+ Uint i;
+ int counter = 0;
+ /* foreach connected component */
+ for (i = 0; i < ccs; i++) {
+ /* if both species have an entry, count */
+ if (matrix[a][i] == '1' && matrix[b][i] == '1') counter++;
+ }
+ /* return the sum of common entries */
+ return counter;
+}
+
+/*
+ * fetches the matrix from a file
+ * ccs - number of ccs (will be defined here)
+ * viecher - number of species (will be defined here)
+ * filename- file to fetch the data from
+ * size - initialy allocated size for the the matrix
+ * pnamen - species names ordered the same way as in the returned matrix (will be defined here)
+ * return - pointer to the matrix
+ *
+ * file format:
+ * no. of species
+ * names of species, tab seperated
+ * linewise connected component entries with tab seperated 1 or 0
+ */
+BOOL** fetch_matrix(Uint* ccs, Uint* viecher, char* filename, Uint size, char*** pnamen) {
+ Uint i;
+
+ /* open 0/1 ccs matrix */
+ FILE* datei = fopen (filename, "r");
+ /* read first line = no. of species */
+ fscanf (datei, "%d\n", viecher);
+ /* allocate memory for species names */
+ *pnamen = (char**) malloc(sizeof(char*)*(*viecher));
+ /* read species names and genecount from 2nd line of file */
+ for (i = 0; i < (*viecher); i++) {
+ /* allocate memory for name */
+ /* Attention: dont use strings longer than 255 characters! */
+ (*pnamen)[i] = (char*) malloc(sizeof(char)*255);
+ /* load name in allocated memory */
+ if (i < (*viecher)-1) {
+ /* normal case: */
+ fscanf (datei, "%s ",(*pnamen)[i]);
+ }
+ else {
+ /* special case: last species name is followed by newline */
+ fscanf (datei, "%s \n",(*pnamen)[i]);
+ }
+ /* resize memory to what is needed */
+ (*pnamen)[i] = (char*) realloc((*pnamen)[i],sizeof(char)*(strlen((*pnamen)[i])+1));
+ }
+
+ /* number of connected components is 0 at the beginnig */
+ *ccs = 0;
+ /* allocate memory for 0/1 ccs matrix */
+ BOOL** matrix = (BOOL**) malloc(sizeof(BOOL*)*(*viecher));
+
+ /* Rowwise alloc */
+ for (i = 0; i < (*viecher); i++) {
+ matrix[i] = (BOOL*) malloc(sizeof(BOOL)*size);
+ }
+
+ /* will contain the first char read from line */
+ BOOL first;
+
+ /* read ccs from file, line by line, start with the first char of each line */
+ while(fscanf(datei,"%c ",&first) != EOF){
+ /* if more ccs than allocated memory -> allocate more memory */
+ if(*ccs >= size) {
+ size += 10000;
+ for (i = 0; i < (*viecher); i++) {
+ matrix[i] = (BOOL*) realloc(matrix[i],sizeof(BOOL)*size); // BOOL*
+ }
+ }
+ /* set first number to what was found first */
+ matrix[0][*ccs] = first;
+ /* and gather the rest one by one from file */
+ for (i = 1; i < (*viecher)-1; i++) {
+ /* store char in row */
+ fscanf (datei, "%c ",&(matrix[i][*ccs]));
+ }
+ /* special case: last char is followed by newline */
+ fscanf (datei, "%c \n",&(matrix[(*viecher)-1][*ccs]));
+ /* we have a new cc in memory */
+ (*ccs)++;
+ }
+ /* file read throug, close it */
+ fclose (datei);
+
+ /* resize memory to what is needed for size is likely to be bigger than ccs */
+ for (i = 0; i < (*viecher); i++) {
+ matrix[i] = (BOOL*) realloc(matrix[i],sizeof(BOOL)*(*ccs));
+ }
+
+ /* return the matrix pointer */
+ return matrix;
+}
+
+/*
+ * finds the last [number] entry within a string and converts it to an int
+ * word - the string to scan
+ * return - number within the last [number] pattern
+ */
+int get_number(char* word) {
+ int i, start = -1, ende = -1;
+ /* find start and end of the number */
+ for (i = strlen(word); i >= 0; i--) {
+ if (word[i] == ']') ende = i;
+ if (word[i] == '[') {
+ start = i+1;
+ break;
+ }
+ }
+ /* allocate memory for the number alone */
+ char zahl[ende-start+1];
+ /* copy the numbe to the new string */
+ for (i = 0; i < ende-start; i++) {
+ zahl[i] = word[start+i];
+ }
+ /* add the char finish sign */
+ zahl[ende-start] = '\0';
+ /* return the number as int */
+ return atoi(zahl);
+}
+
+/*
+ * some compilers do not support itoa, so here is something which works for us
+ * converts positive numbers to the corresponding text-form as char-array
+ * Attention: up to 64 signs are supported, base between 1 and 16
+ * if 'x' is included in the output, sth. went wrong
+ * zahl - number to convert (should be postive
+ * base - base to witch number will be converted
+ * returns - char array representing number zahl to base base as text
+ * Attention: returnvalue needs to be freed separately
+ */
+char* myitoa(int zahl, int base) {
+ int i = 0, k;
+ char reverse[64];
+
+ /* Zero is easy, also negative numbers are cought here */
+ if(zahl == 0){
+ char* zero = (char*) malloc(sizeof(char)*2);
+ zero[0] = '0';
+ zero[1] = '\0';
+ return zero;
+ }
+
+ /* convert to a reverse-"string" by modulu-calculation */
+ while (zahl > 0) {
+ switch(zahl%base) {
+ case 0: reverse[i] = '0';break;
+ case 1: reverse[i] = '1';break;
+ case 2: reverse[i] = '2';break;
+ case 3: reverse[i] = '3';break;
+ case 4: reverse[i] = '4';break;
+ case 5: reverse[i] = '5';break;
+ case 6: reverse[i] = '6';break;
+ case 7: reverse[i] = '7';break;
+ case 8: reverse[i] = '8';break;
+ case 9: reverse[i] = '9';break;
+ case 10: reverse[i] = 'A';break;
+ case 11: reverse[i] = 'B';break;
+ case 12: reverse[i] = 'C';break;
+ case 13: reverse[i] = 'D';break;
+ case 14: reverse[i] = 'E';break;
+ case 15: reverse[i] = 'F';break;
+ case 16: reverse[i] = 'G';break;
+ default: reverse[i] = 'x';
+ }
+ /* point one position further */
+ i++;
+ zahl /= base;
+ }
+ /* it's actually one sign less */
+ i--;
+ /* allocate memory for output */
+ char* string = (char*) malloc(sizeof(char)*(i+2));
+ /* reverse to calculated char-array to have the right order of signs */
+ for (k = i; k >= 0; k--) {
+ string[i-k] = reverse[k];
+ }
+ /* it's a string, so finish it with \0 */
+ string[i+1] = '\0';
+ /* and return it */
+ return string;
+}
diff --git a/proteinortho5.pl b/proteinortho5.pl
index 5cf85c1..719e64f 100755
--- a/proteinortho5.pl
+++ b/proteinortho5.pl
@@ -30,7 +30,7 @@
# @author Marcus Lechner
# @email lechner at staff.uni-marburg.de
# @company University of Maruburg
-# @date 2014-07-07
+# @date 2016-02-22
#
##########################################################################################
@@ -47,7 +47,7 @@ use Thread::Queue;
##########################################################################################
# Variables
##########################################################################################
-our $version = "5.11";
+our $version = "5.12";
our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1 -> don't / be verbose
our $debug = 0; # 0/1 -> don't / show debug data
@@ -77,6 +77,7 @@ our $clean = 0;
our $blastOptions = "";
our $nograph = 0;
our $desc = 0;
+our $tmp_path = "";
# Internal
our $blastversion = "unknown"; # Auto-detected blast version
@@ -105,6 +106,10 @@ foreach my $option (@ARGV) {
if ($option =~ m/^--?step=(0|1|2|3)$/) { $step = $1; }
elsif ($option =~ m/^--?verbose$/) { $verbose = 1; }
elsif ($option =~ m/^--?verbose=(0|1)$/) { $verbose = $1; }
+ elsif ($option =~ m/^--?te?mp=(.+)$/) { $tmp_path = $1;
+ # make sure it ends with /
+ unless ($tmp_path =~ /\/$/) {$tmp_path .= "/";}
+ }
elsif ($option =~ m/^--?debug$/) { $debug = 1; }
elsif ($option =~ m/^--?debug=(0|1)$/) { $debug = $1; }
elsif ($option =~ m/^--?p=(.*)$/) { $blastmode = $1; }
@@ -158,8 +163,8 @@ if ($startat != 0 || $stopat != -1) {
our $simgraph = "$project.blast-graph$run_id"; # Output file graph
our $syngraph = "$project.ffadj-graph$run_id"; # Output file synteny
-our $rm_simgraph = "$project.removed_blast-graph"; # Output remove graph
-our $rm_syngraph = "$project.removed_ffadj-graph"; # Output remove graph
+our $rm_simgraph = "$tmp_path$project.removed_blast-graph"; # Output remove graph
+our $rm_syngraph = "$tmp_path$project.removed_ffadj-graph"; # Output remove graph
our $csimgraph = "$project.proteinortho-graph"; # Output file graph
our $csyngraph = "$project.poff-graph"; # Output file synteny
@@ -280,6 +285,7 @@ Options: -e= E-value for blast [default: 1e-05]
-p= blast program {blastn|blastp|blastn+|blastp+}
[default: blastp+]
-project= prefix for all result file names [default: myproject]
+ -temp= path for temporary files [default: working directory]
-synteny activate PoFF extension to separate similar sequences
by contextual adjacencies (requires .gff for each .fasta)
-dups= PoFF: number of reiterations for adjacencies heuristic,
@@ -390,7 +396,7 @@ sub run_blast {
sub workerthread {
my $thread_id = threads->tid();
- my $temp_file = "$project-$run_id-$thread_id";
+ my $temp_file = "$tmp_path$project-$run_id-$thread_id";
# Clean up, just to be safe
unlink("$temp_file.tmp");
@@ -462,7 +468,7 @@ sub workerthread {
open(PREGRAPH,">>$temp_file.tmp") || die("Could not open temp file '$temp_file.tmp': $!");
print PREGRAPH $ordered_matches;
close(PREGRAPH);
- my $cmd = "$po_path/ffadj_mcs.py $temp_file.tmp $alpha";
+ my $cmd = "$po_path/ffadj_mcs.py '$temp_file.tmp' $alpha";
if ($duplication) {
$cmd .= " --repeat-matching $duplication --min-cs-size $cs";
}
@@ -863,7 +869,7 @@ sub blast {
my $b = $_[1];
$a =~ s/^.*\///;
$b =~ s/^.*\///;
- my $bla = "$a.vs.$b.bla";
+ my $bla = "$tmp_path$a.vs.$b.bla";
# File does not exists yet or I am forced to rewrite it
if (!(-s $bla) || $force) {
diff --git a/proteinortho5_clustering.cpp b/proteinortho5_clustering.cpp
index 1691eb4..a7bbba1 100755
--- a/proteinortho5_clustering.cpp
+++ b/proteinortho5_clustering.cpp
@@ -33,6 +33,7 @@ void clear_edges(vector<unsigned int>&);
void partition_graph(void);
void print_group(vector<unsigned int>& , float );
void print_header(void);
+vector<unsigned int> get_deg_one (vector<unsigned int>& );
// Parameters
bool param_verbose = false;
@@ -285,6 +286,16 @@ unsigned int max_deg(vector<unsigned int>& nodes) {
return max;
}
+// Return nodes with degree 1 of given protein_ids
+vector<unsigned int> get_deg_one (vector<unsigned int>& nodes) {
+ vector<unsigned int> one;
+ for (unsigned int i = 0; i < nodes.size(); i++) {
+ unsigned int degree = graph[nodes[i]].edges.size();
+ if (degree == 1) {one.push_back(nodes[i]);}
+ }
+ return one;
+}
+
// Generate random vector x of size size
vector<float> generate_random_vector(const unsigned int size) {
vector<float> x(size);
@@ -378,6 +389,17 @@ void removeExternalEdges(map<unsigned int,bool>& a) {
// Split connected component according to eigenvector
void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
+ // Remove tree like structures in the beginning
+ vector<unsigned int> one = get_deg_one(nodes);
+ if (one.size() > 0) {
+ map<unsigned int,bool> tree;
+ for (unsigned int i = 0; i < one.size(); i++) {
+ {tree[one[i]] = true;}
+ }
+ removeExternalEdges(tree);
+ return;
+ }
+
// Store data about two groups
map<unsigned int,bool> groupA, groupB;
for (unsigned int i = 0; i < y.size(); i++) {
@@ -385,6 +407,12 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes){
if (y[i] < 0) {groupA[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#95cde5}" << endl; }
else {groupB[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#b01700}" << endl; }
}
+
+ // Catch error in laplacien calcs
+ if (groupA.size() == 0 || groupB.size() == 0) {
+ throw "Failed to partition subgraph! This might lead to an infinit loop. Please submit the .blastgraph file to lechner at staff.uni-marburg.de to help fixing this issue.";
+ }
+
removeExternalEdges(groupA);
removeExternalEdges(groupB);
}
@@ -406,7 +434,8 @@ float getConnectivity(vector<unsigned int>& nodes) {
vector<float> norm = nomalize(x_hat, &last_length);
// Repeat until difference < epsilon
- while(1) {
+ unsigned int iter = 1000; // max 1000 iterations (catch huge clustering issues)
+ while(iter-- != 0) {
last_length = current_length;
// Get a new x
x = get_new_x(norm, nodes, mapping);
@@ -417,8 +446,11 @@ float getConnectivity(vector<unsigned int>& nodes) {
// Get lenght (lambda) & normalize vector
norm = nomalize(x_hat, ¤t_length);
// cerr << "IT: " << current_length << endl;
- if (abs(current_length-last_length) < 0.0001) break;
+ if (abs(current_length-last_length) < 0.000333 && iter >= 10) break; // min 10 iterations (prevent convergence by chance)
}
+
+ // cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
+
float connectivity = (-current_length+2*max_degree)/(nodes.size());
// Split groups if connectivity is too low, remove tree like structures that might have arosen
diff --git a/proteinortho5_tree b/proteinortho5_tree
new file mode 100755
index 0000000..96cbbe4
Binary files /dev/null and b/proteinortho5_tree differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/proteinortho.git
More information about the debian-med-commit
mailing list