[med-svn] [Git][med-team/proteinortho][upstream] New upstream version 6.0.28+dfsg

Mon Feb 1 10:31:08 GMT 2021


Nilesh Patra pushed to branch upstream at Debian Med / proteinortho


Commits:
6db4d97b by Nilesh Patra at 2021-02-01T15:55:02+05:30
New upstream version 6.0.28+dfsg
- - - - -


5 changed files:

- CHANGELOG
- CHANGEUID
- proteinortho6.pl
- src/proteinortho_clustering.cpp
- src/proteinortho_grab_proteins.pl


Changes:

=====================================
CHANGELOG
=====================================
@@ -287,3 +287,5 @@
 		proteinortho.summary will be omitted if -nograph is set (there is no graph to generate a summary from)
 	10. Jan (5379)
 		last update introduced a bug that removes the *.blast-graph if --step=3 is used
+	29. Jan (5399)
+		fixed a bug (https://gitlab.com/paulklemm_PHD/proteinortho/-/issues/44) involving the --isoform=trinity option (search pattern was too strict), thanks to Sasha Sh !
\ No newline at end of file


=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-5379
+5399


=====================================
proteinortho6.pl
=====================================
@@ -464,7 +464,7 @@ use POSIX;
 ##########################################################################################
 # Variables
 ##########################################################################################
-our $version = "6.0.27";
+our $version = "6.0.28";
 our $step = 0;    # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
 our $verbose = 1; # 0/1   -> don't / be verbose
 our $debug = 0;   # 0/1   -> don't / show debug data
@@ -500,6 +500,7 @@ our $doxml = 0;
 our $desc = 0;
 our $tmp_path = "";
 our $useMcl = 0;
+$|=1; #autoflush
 
 # Internal
 our $blastversion = "unknown";  # Auto-detected blastmode version
@@ -641,7 +642,7 @@ if($selfblast){$checkblast=1;}
 
 $po_path = &get_po_path();    # Determine local path
 
-our $nucleotideAlphabet="ACGTURYSWKMBDHVNXacgturyswkmbdhvnx\.\-";
+our $nucleotideAlphabet="ACGTNacgtn\.\-"; #ACGTURYSWKMBDHVNXacgturyswkmbdhvnx
 our $aminoAlphabet="XOUBZACDEFGHIKLMNPQRSTVWYxoubzacdefghiklmnpqrstvwy\.\*\-";
 our %autoblast_fileis;
 our $allowedAlphabet = {
@@ -1008,7 +1009,8 @@ sub cluster {
     my $cluster_verbose_level = "";
     if($verbose == 1){ $cluster_verbose_level = "-verbose 1 "; }
     if($verbose == 2){ $cluster_verbose_level = "-debug 1 "; }
-    system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $cluster_verbose_level -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -conn $connectivity -purity $purity $clusterOptions -rmgraph '$rm_simgraph' '$simgraph'* >'$simtable' ".($verbose == 2 ? "" : "2>/dev/null"));
+
+    system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $cluster_verbose_level -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -conn $connectivity -purity $purity ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -rmgraph '$rm_simgraph' '$simgraph'* >'$simtable' ".($verbose == 2 ? "" : "2>/dev/null"));
     if ($? != 0) {
           &Error("'proteinortho_clustering' failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nMaybe your operating system does not support the statically compiled version, please try recompiling proteinortho with 'make clean' and 'make' (and 'make install PREFIX=...').");
         }
@@ -1024,7 +1026,7 @@ sub cluster {
 
   if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtable\n";}
   if(scalar @files < 10){
-    if($verbose){print STDERR "You can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl -tofiles $simtable '".join("' '",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
+    if($verbose){print STDERR "You can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl ".($isoform ne "" ? "--isoform" : "")." -tofiles $simtable '".join("' '",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
   }
 
   system("(head -n 1 '$simtable' && tail -n +2 '$simtable' | LC_ALL=C sort -k1,1nr -k2,2nr -k3,3nr ) > '$simtable.sort'; mv '$simtable.sort' '$simtable'");
@@ -1072,7 +1074,8 @@ sub cluster {
       my $cluster_verbose_level = "";
       if($verbose == 1){ $cluster_verbose_level = "-verbose 1 "; }
       if($verbose == 2){ $cluster_verbose_level = "-debug 1 "; }
-      system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $cluster_verbose_level $clusterOptions -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -conn $connectivity -purity $purity $clusterOptions -rmgraph '$rm_syngraph' '$syngraph'* >'$syntable' ".($verbose == 2 ? "" : "2>/dev/null"));
+
+      system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $cluster_verbose_level ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -conn $connectivity -purity $purity -rmgraph '$rm_syngraph' '$syngraph'* >'$syntable' ".($verbose == 2 ? "" : "2>/dev/null"));
       if ($? != 0) {
         &Error("proteinortho_clustering failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nDid you use a static version? Maybe your operating system does not support the static compiled version, please recompile 'make clean' and 'make' or 'make USEPRECOMPILEDLAPACK=FALSE'.");
       }
@@ -1084,7 +1087,7 @@ sub cluster {
 
     system("(head -n 1 '$syntable' && tail -n +2 '$syntable' | LC_ALL=C sort -k1,1nr -k2,2nr -k3,3nr ) > '$syntable.sort'; mv '$syntable.sort' '$syntable'");
 
-    if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $syntable\nYou can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl -tofiles $syntable '".join("' '",@files)."'\n(Careful: This will generate a file foreach line in the file $syntable).\n";}
+    if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $syntable\nYou can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl ".($isoform ne "" ? "--isoform" : "")." -tofiles $syntable '".join("' '",@files)."'\n(Careful: This will generate a file foreach line in the file $syntable).\n";}
 
     if ($singles) {
       if($verbose){print STDERR "Adding singles...\n";}
@@ -1156,6 +1159,7 @@ Options:
                       {autoblast|blastp|blastn|tblastx|blastp_legacy|blastn_legacy|tblastx_legacy|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|blatp|blatn|mmseqsp|mmseqsn}
                       The suffix 'p' or 'n' indicates aminoacid fasta files (p) or nucleotide fasta files (n).
                       The suffix '_legacy' indicates legacy blastall (otherwise blast+ is used).
+         -checkfasta  Checks if the given fasta files are compatible with the algorithm of -p
          -e=          E-value for blast [default: 1e-05]
 
          [Synteny options]
@@ -1226,7 +1230,7 @@ Options:
                         topaz : Only for protein files!
                         blat* : Blat family. blatp : For protein files! blatn : For dna files! blatx : For dna files!
                         mmseqs* : mmseqs family. mmseqsp : For protein files! mmseqsn : For dna files! blatx : For dna files!
-
+         -checkfasta  Checks if the given fasta files are compatible with the algorithm of -p
          -e=          E-value for blast [default: 1e-05]
          -selfblast   apply selfblast, detects paralogs without orthologs
          -sim=        min. similarity for additional hits (0..1) [default: 0.95]
@@ -1878,7 +1882,7 @@ sub get_legal_matches {
 
     if($isoform ne ""){
       if(exists $isoform_mapping{$query_id} ){ $query_id=$isoform_mapping{$query_id}; }
-      if(exists $isoform_mapping{$subject_id} ){ $query_id=$isoform_mapping{$subject_id}; }
+      if(exists $isoform_mapping{$subject_id} ){ $subject_id=$isoform_mapping{$subject_id}; }
     }
    
     # It hit itself (only during selfblast)
@@ -2602,7 +2606,7 @@ sub check_bins {
 # Check plausibility of files
 sub check_files {
   if ( ( scalar(@files) == 0 || (scalar(@files) == 1 && $selfblast==0) ) && $step != 3)   {&print_usage; &Error("I need at least two files to compare something!");}
-  if($verbose){print STDERR "Checking input files";if($checkfasta){print STDERR " very carefully (-check).\n"}else{print STDERR ".\n";}}
+  if($verbose){print STDERR "Checking input files";if($checkfasta){print STDERR " carefully (-checkfasta).\n"}else{print STDERR ".\n";}}
 
   foreach my $file (@files) {
     if ($verbose) {print STDERR "Checking $file... ";}
@@ -2664,12 +2668,13 @@ sub read_details {
         $isoform_mapping_ncbiuniprot_correction{&convertUniprotAndNCBI($curLine)}=$curLine;
 
       }elsif($isoform eq "trinity"){
-        $curLine=~s/_i[0-9]+$//g;
-        if($curLine =~ m/^([^ ]+)_i[0-9]+( |$)/){
+
+        if($curLine =~ m/^([^ ]+)_i[0-9]+([^0-9].*|$)/){
           my $iso = $1;
           $curLine =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
           $curLine =~ s/^>//;
           $curLine =~ s/\s.*//;
+          $iso =~ s/^>//;
           $isoform_mapping{$curLine}=$iso;
 
           if($debug){print STDERR "found isoform '$curLine' => '$iso'\n";}
@@ -2780,9 +2785,9 @@ sub read_details {
     if($allowedAlphabet->{$blastmode} eq "n" && $cur_gene_is_valid<1 ){
 
       if($cur_gene_is_valid==-1){
-       print STDERR ("\n$ORANGE [WARNING]$NC The occurences of ATCGN is less than 50% of in input fasta file '".$file."' in gene '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
+       print STDERR ("\n$ORANGE [WARNING]$NC The occurences of nucleotide characters (ATCGN) is less than 50% of in input fasta file '".$file."' in entry '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
       }else{
-        print STDERR ("\n$ORANGE [WARNING]$NC Found forbidden non-nucleotide character in input fasta file '".$file."' in gene '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
+        print STDERR ("\n$ORANGE [WARNING]$NC Found forbidden non-nucleotide character in input fasta file '".$file."' in entry '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
       }
       if( exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing
         $blastmode = $blastmode_pendant->{$blastmode};
@@ -2798,9 +2803,9 @@ sub read_details {
     }elsif($allowedAlphabet->{$blastmode} eq "a" && $cur_gene_is_valid<1 ){
 
       if($cur_gene_is_valid==-1){
-        print STDERR ("\n$ORANGE [WARNING]$NC The occurences of ATCGN is greater than 80% of '".$file."' in gene '".$lastgenename."'. $blastmode expects aminoacid characters...$NC");
+        print STDERR ("\n$ORANGE [WARNING]$NC The occurences of nucleotide characters (ATCGN) is greater than 80% of '".$file."' in entry '".$lastgenename."'. $blastmode expects aminoacid characters...$NC");
       }else{
-        print STDERR ("\$ORANGE [WARNING]$NC Found forbidden non-aminoacid character in input fasta file '$file' in gene '$lastgenename'. $blastmode expects aminoacid characters$NC");
+        print STDERR ("\$ORANGE [WARNING]$NC Found forbidden non-aminoacid character in input fasta file '$file' in entry '$lastgenename'. $blastmode expects aminoacid characters$NC");
       }
 
       if(exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing


=====================================
src/proteinortho_clustering.cpp
=====================================
@@ -1889,7 +1889,7 @@ void splitGroups(vector<floattype>& y, vector<unsigned int>& nodes , bool useLap
 				cerr << "[WARNING]   Failed to partition subgraph with "<<nodes.size()<<" nodes into ("<<groupA.size()<<","<<groupB.size()<<","<<groupZero.size()<<") sized groups using lapack, now reiterating with power iteration." << "\n";
 				getConnectivity(nodes, false);
 			}else{
-				cerr << "[CRITICAL WARNING]   Failed to partition subgraph with "<<nodes.size()<<" nodes into ("<<groupA.size()<<","<<groupB.size()<<","<<groupZero.size()<<") sized groups, now using kmere heuristic as fall-back." << "\n";
+				cerr << "[WARNING]   Failed to partition subgraph with "<<nodes.size()<<" nodes into ("<<groupA.size()<<","<<groupB.size()<<","<<groupZero.size()<<") sized groups, now using kmere heuristic as fall-back." << "\n";
 				fallback_justdokmerenow = true;
 				goto do_kmereAlgorithm;
 			}


=====================================
src/proteinortho_grab_proteins.pl
=====================================
@@ -28,8 +28,8 @@
 # @author Paul Klemm
 # @email klemmp at staff.uni-marburg.de
 # @company Bioinformatics, University of Leipzig
-# @version 4
-# @date 3-12-2020
+# @version 5
+# @date 1-29-2021
 #
 ##########################################################################################
 
@@ -53,6 +53,7 @@ proteinortho_grab_proteins.pl (options) QUERY FASTA1 (FASTA2 ...)
 		-exact        search patters are extended with a \b, that indicates end of word.
 		-source, -s   adds the filename (FASTA1,...) to the found gene-name
 		-F=s          char delimiter for multiple identifier if QUERY is a string input (default: ',')
+		-isoform      if you use proteinortho with --isoform option, then you need to set this option here too. 
 
 DESCRIPTION
  
@@ -100,6 +101,7 @@ ENDUSAGE
 my $query;
 my $help;
 my $tofiles=0;
+my $isoform=0;
 my $justid;
 my $prefix=">";
 my $doregex=0;
@@ -117,6 +119,7 @@ for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
 	elsif($ARGV_copy[$v] =~ m/^--?(source|s)$/){$source=1;}
 	elsif($ARGV_copy[$v] =~ m/^--?F=(.*)$/){$del=$1;}
 	elsif($ARGV_copy[$v] =~ m/^--?E$/){$doregex=1;}
+	elsif($ARGV_copy[$v] =~ m/^--?isoform$/){$isoform=1;}
 	elsif($ARGV_copy[$v] =~ m/^--?exact$/){$exact=1;}
 	elsif($ARGV_copy[$v] =~ m/^-.+/){print $usage; print STDERR "ERROR: invalid option ".$ARGV_copy[$v]."!\n\n";exit(1);}
 	elsif(!defined($query)){$query = $ARGV_copy[$v];}
@@ -203,7 +206,8 @@ unless(open(my $FH,'<',$query)) {
 		}
 	}
 }else{
-	if(!$exact){print STDERR "[STDERR] WARNING The -exact option is mandatory if a proteinortho file is given. -exact is now set.\n";$exact=1;}
+	if($isoform && $exact){print STDERR "[STDERR] WARNING The -isoform option is not compatible with -exact if a proteinortho file is given. -exact is now unset.\n";$exact=0;}
+	elsif(!$isoform && !$exact){print STDERR "[STDERR] WARNING The -exact option is mandatory if a proteinortho file is given. -exact is now set.\n";$exact=1;}
 	if($doregex){print STDERR "[STDERR] WARNING The -E option is not allowed if a proteinortho file is given. -E is now unset.\n";$doregex=0;}
 
 	my $query_basename=$query;
@@ -328,7 +332,7 @@ for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
 						if( $test_match ){
 
 							if($qdata{$filename}{$key} eq ""){
-								print STDERR "[STDERR] WARNIÂ NG The input ($key) was found multiple times in the fasta files ".(!$exact ? "(maybe try --exact)." : ".")."\n";
+								print STDERR "[STDERR] WARNING The input ($key) was found multiple times in the fasta files ".(!$exact ? "(maybe try --exact)." : ".")."\n";
 							}
 
 							my $headerstr=$curLine;



View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/6db4d97b2bd51ad57b2810a7aa0eb7f89840016d

-- 
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/6db4d97b2bd51ad57b2810a7aa0eb7f89840016d
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210201/42aed87d/attachment-0001.html>