[med-svn] [Git][med-team/proteinortho][upstream] New upstream version 6.0.28+dfsg
Nilesh Patra
gitlab at salsa.debian.org
Mon Feb 1 10:31:08 GMT 2021
Nilesh Patra pushed to branch upstream at Debian Med / proteinortho
Commits:
6db4d97b by Nilesh Patra at 2021-02-01T15:55:02+05:30
New upstream version 6.0.28+dfsg
- - - - -
5 changed files:
- CHANGELOG
- CHANGEUID
- proteinortho6.pl
- src/proteinortho_clustering.cpp
- src/proteinortho_grab_proteins.pl
Changes:
=====================================
CHANGELOG
=====================================
@@ -287,3 +287,5 @@
proteinortho.summary will be omitted if -nograph is set (there is no graph to generate a summary from)
10. Jan (5379)
last update introduced a bug that removes the *.blast-graph if --step=3 is used
+ 29. Jan (5399)
+ fixed a bug (https://gitlab.com/paulklemm_PHD/proteinortho/-/issues/44) involving the --isoform=trinity option (search pattern was too strict), thanks to Sasha Sh !
\ No newline at end of file
=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-5379
+5399
=====================================
proteinortho6.pl
=====================================
@@ -464,7 +464,7 @@ use POSIX;
##########################################################################################
# Variables
##########################################################################################
-our $version = "6.0.27";
+our $version = "6.0.28";
our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1 -> don't / be verbose
our $debug = 0; # 0/1 -> don't / show debug data
@@ -500,6 +500,7 @@ our $doxml = 0;
our $desc = 0;
our $tmp_path = "";
our $useMcl = 0;
+$|=1; #autoflush
# Internal
our $blastversion = "unknown"; # Auto-detected blastmode version
@@ -641,7 +642,7 @@ if($selfblast){$checkblast=1;}
$po_path = &get_po_path(); # Determine local path
-our $nucleotideAlphabet="ACGTURYSWKMBDHVNXacgturyswkmbdhvnx\.\-";
+our $nucleotideAlphabet="ACGTNacgtn\.\-"; #ACGTURYSWKMBDHVNXacgturyswkmbdhvnx
our $aminoAlphabet="XOUBZACDEFGHIKLMNPQRSTVWYxoubzacdefghiklmnpqrstvwy\.\*\-";
our %autoblast_fileis;
our $allowedAlphabet = {
@@ -1008,7 +1009,8 @@ sub cluster {
my $cluster_verbose_level = "";
if($verbose == 1){ $cluster_verbose_level = "-verbose 1 "; }
if($verbose == 2){ $cluster_verbose_level = "-debug 1 "; }
- system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $cluster_verbose_level -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -conn $connectivity -purity $purity $clusterOptions -rmgraph '$rm_simgraph' '$simgraph'* >'$simtable' ".($verbose == 2 ? "" : "2>/dev/null"));
+
+ system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $cluster_verbose_level -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -conn $connectivity -purity $purity ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -rmgraph '$rm_simgraph' '$simgraph'* >'$simtable' ".($verbose == 2 ? "" : "2>/dev/null"));
if ($? != 0) {
&Error("'proteinortho_clustering' failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nMaybe your operating system does not support the statically compiled version, please try recompiling proteinortho with 'make clean' and 'make' (and 'make install PREFIX=...').");
}
@@ -1024,7 +1026,7 @@ sub cluster {
if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtable\n";}
if(scalar @files < 10){
- if($verbose){print STDERR "You can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl -tofiles $simtable '".join("' '",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
+ if($verbose){print STDERR "You can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl ".($isoform ne "" ? "--isoform" : "")." -tofiles $simtable '".join("' '",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
}
system("(head -n 1 '$simtable' && tail -n +2 '$simtable' | LC_ALL=C sort -k1,1nr -k2,2nr -k3,3nr ) > '$simtable.sort'; mv '$simtable.sort' '$simtable'");
@@ -1072,7 +1074,8 @@ sub cluster {
my $cluster_verbose_level = "";
if($verbose == 1){ $cluster_verbose_level = "-verbose 1 "; }
if($verbose == 2){ $cluster_verbose_level = "-debug 1 "; }
- system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $cluster_verbose_level $clusterOptions -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -conn $connectivity -purity $purity $clusterOptions -rmgraph '$rm_syngraph' '$syngraph'* >'$syntable' ".($verbose == 2 ? "" : "2>/dev/null"));
+
+ system ("OMP_PROC_BIND=$ompprocbind $po_path/proteinortho_clustering $cluster_verbose_level ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -minspecies $minspecies -ram ".$freemem_inMB." -kmere ".(1-$exactstep3)." -debug $debug -cpus $cpus -weighted 1 -conn $connectivity -purity $purity -rmgraph '$rm_syngraph' '$syngraph'* >'$syntable' ".($verbose == 2 ? "" : "2>/dev/null"));
if ($? != 0) {
&Error("proteinortho_clustering failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nDid you use a static version? Maybe your operating system does not support the static compiled version, please recompile 'make clean' and 'make' or 'make USEPRECOMPILEDLAPACK=FALSE'.");
}
@@ -1084,7 +1087,7 @@ sub cluster {
system("(head -n 1 '$syntable' && tail -n +2 '$syntable' | LC_ALL=C sort -k1,1nr -k2,2nr -k3,3nr ) > '$syntable.sort'; mv '$syntable.sort' '$syntable'");
- if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $syntable\nYou can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl -tofiles $syntable '".join("' '",@files)."'\n(Careful: This will generate a file foreach line in the file $syntable).\n";}
+ if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $syntable\nYou can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl ".($isoform ne "" ? "--isoform" : "")." -tofiles $syntable '".join("' '",@files)."'\n(Careful: This will generate a file foreach line in the file $syntable).\n";}
if ($singles) {
if($verbose){print STDERR "Adding singles...\n";}
@@ -1156,6 +1159,7 @@ Options:
{autoblast|blastp|blastn|tblastx|blastp_legacy|blastn_legacy|tblastx_legacy|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|blatp|blatn|mmseqsp|mmseqsn}
The suffix 'p' or 'n' indicates aminoacid fasta files (p) or nucleotide fasta files (n).
The suffix '_legacy' indicates legacy blastall (otherwise blast+ is used).
+ -checkfasta Checks if the given fasta files are compatible with the algorithm of -p
-e= E-value for blast [default: 1e-05]
[Synteny options]
@@ -1226,7 +1230,7 @@ Options:
topaz : Only for protein files!
blat* : Blat family. blatp : For protein files! blatn : For dna files! blatx : For dna files!
mmseqs* : mmseqs family. mmseqsp : For protein files! mmseqsn : For dna files! blatx : For dna files!
-
+ -checkfasta Checks if the given fasta files are compatible with the algorithm of -p
-e= E-value for blast [default: 1e-05]
-selfblast apply selfblast, detects paralogs without orthologs
-sim= min. similarity for additional hits (0..1) [default: 0.95]
@@ -1878,7 +1882,7 @@ sub get_legal_matches {
if($isoform ne ""){
if(exists $isoform_mapping{$query_id} ){ $query_id=$isoform_mapping{$query_id}; }
- if(exists $isoform_mapping{$subject_id} ){ $query_id=$isoform_mapping{$subject_id}; }
+ if(exists $isoform_mapping{$subject_id} ){ $subject_id=$isoform_mapping{$subject_id}; }
}
# It hit itself (only during selfblast)
@@ -2602,7 +2606,7 @@ sub check_bins {
# Check plausibility of files
sub check_files {
if ( ( scalar(@files) == 0 || (scalar(@files) == 1 && $selfblast==0) ) && $step != 3) {&print_usage; &Error("I need at least two files to compare something!");}
- if($verbose){print STDERR "Checking input files";if($checkfasta){print STDERR " very carefully (-check).\n"}else{print STDERR ".\n";}}
+ if($verbose){print STDERR "Checking input files";if($checkfasta){print STDERR " carefully (-checkfasta).\n"}else{print STDERR ".\n";}}
foreach my $file (@files) {
if ($verbose) {print STDERR "Checking $file... ";}
@@ -2664,12 +2668,13 @@ sub read_details {
$isoform_mapping_ncbiuniprot_correction{&convertUniprotAndNCBI($curLine)}=$curLine;
}elsif($isoform eq "trinity"){
- $curLine=~s/_i[0-9]+$//g;
- if($curLine =~ m/^([^ ]+)_i[0-9]+( |$)/){
+
+ if($curLine =~ m/^([^ ]+)_i[0-9]+([^0-9].*|$)/){
my $iso = $1;
$curLine =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
$curLine =~ s/^>//;
$curLine =~ s/\s.*//;
+ $iso =~ s/^>//;
$isoform_mapping{$curLine}=$iso;
if($debug){print STDERR "found isoform '$curLine' => '$iso'\n";}
@@ -2780,9 +2785,9 @@ sub read_details {
if($allowedAlphabet->{$blastmode} eq "n" && $cur_gene_is_valid<1 ){
if($cur_gene_is_valid==-1){
- print STDERR ("\n$ORANGE [WARNING]$NC The occurences of ATCGN is less than 50% of in input fasta file '".$file."' in gene '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
+ print STDERR ("\n$ORANGE [WARNING]$NC The occurences of nucleotide characters (ATCGN) is less than 50% of in input fasta file '".$file."' in entry '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
}else{
- print STDERR ("\n$ORANGE [WARNING]$NC Found forbidden non-nucleotide character in input fasta file '".$file."' in gene '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
+ print STDERR ("\n$ORANGE [WARNING]$NC Found forbidden non-nucleotide character in input fasta file '".$file."' in entry '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
}
if( exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing
$blastmode = $blastmode_pendant->{$blastmode};
@@ -2798,9 +2803,9 @@ sub read_details {
}elsif($allowedAlphabet->{$blastmode} eq "a" && $cur_gene_is_valid<1 ){
if($cur_gene_is_valid==-1){
- print STDERR ("\n$ORANGE [WARNING]$NC The occurences of ATCGN is greater than 80% of '".$file."' in gene '".$lastgenename."'. $blastmode expects aminoacid characters...$NC");
+ print STDERR ("\n$ORANGE [WARNING]$NC The occurences of nucleotide characters (ATCGN) is greater than 80% of '".$file."' in entry '".$lastgenename."'. $blastmode expects aminoacid characters...$NC");
}else{
- print STDERR ("\$ORANGE [WARNING]$NC Found forbidden non-aminoacid character in input fasta file '$file' in gene '$lastgenename'. $blastmode expects aminoacid characters$NC");
+ print STDERR ("\$ORANGE [WARNING]$NC Found forbidden non-aminoacid character in input fasta file '$file' in entry '$lastgenename'. $blastmode expects aminoacid characters$NC");
}
if(exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing
=====================================
src/proteinortho_clustering.cpp
=====================================
@@ -1889,7 +1889,7 @@ void splitGroups(vector<floattype>& y, vector<unsigned int>& nodes , bool useLap
cerr << "[WARNING] Failed to partition subgraph with "<<nodes.size()<<" nodes into ("<<groupA.size()<<","<<groupB.size()<<","<<groupZero.size()<<") sized groups using lapack, now reiterating with power iteration." << "\n";
getConnectivity(nodes, false);
}else{
- cerr << "[CRITICAL WARNING] Failed to partition subgraph with "<<nodes.size()<<" nodes into ("<<groupA.size()<<","<<groupB.size()<<","<<groupZero.size()<<") sized groups, now using kmere heuristic as fall-back." << "\n";
+ cerr << "[WARNING] Failed to partition subgraph with "<<nodes.size()<<" nodes into ("<<groupA.size()<<","<<groupB.size()<<","<<groupZero.size()<<") sized groups, now using kmere heuristic as fall-back." << "\n";
fallback_justdokmerenow = true;
goto do_kmereAlgorithm;
}
=====================================
src/proteinortho_grab_proteins.pl
=====================================
@@ -28,8 +28,8 @@
# @author Paul Klemm
# @email klemmp at staff.uni-marburg.de
# @company Bioinformatics, University of Leipzig
-# @version 4
-# @date 3-12-2020
+# @version 5
+# @date 1-29-2021
#
##########################################################################################
@@ -53,6 +53,7 @@ proteinortho_grab_proteins.pl (options) QUERY FASTA1 (FASTA2 ...)
-exact search patters are extended with a \b, that indicates end of word.
-source, -s adds the filename (FASTA1,...) to the found gene-name
-F=s char delimiter for multiple identifier if QUERY is a string input (default: ',')
+ -isoform if you use proteinortho with --isoform option, then you need to set this option here too.
DESCRIPTION
@@ -100,6 +101,7 @@ ENDUSAGE
my $query;
my $help;
my $tofiles=0;
+my $isoform=0;
my $justid;
my $prefix=">";
my $doregex=0;
@@ -117,6 +119,7 @@ for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
elsif($ARGV_copy[$v] =~ m/^--?(source|s)$/){$source=1;}
elsif($ARGV_copy[$v] =~ m/^--?F=(.*)$/){$del=$1;}
elsif($ARGV_copy[$v] =~ m/^--?E$/){$doregex=1;}
+ elsif($ARGV_copy[$v] =~ m/^--?isoform$/){$isoform=1;}
elsif($ARGV_copy[$v] =~ m/^--?exact$/){$exact=1;}
elsif($ARGV_copy[$v] =~ m/^-.+/){print $usage; print STDERR "ERROR: invalid option ".$ARGV_copy[$v]."!\n\n";exit(1);}
elsif(!defined($query)){$query = $ARGV_copy[$v];}
@@ -203,7 +206,8 @@ unless(open(my $FH,'<',$query)) {
}
}
}else{
- if(!$exact){print STDERR "[STDERR] WARNING The -exact option is mandatory if a proteinortho file is given. -exact is now set.\n";$exact=1;}
+ if($isoform && $exact){print STDERR "[STDERR] WARNING The -isoform option is not compatible with -exact if a proteinortho file is given. -exact is now unset.\n";$exact=0;}
+ elsif(!$isoform && !$exact){print STDERR "[STDERR] WARNING The -exact option is mandatory if a proteinortho file is given. -exact is now set.\n";$exact=1;}
if($doregex){print STDERR "[STDERR] WARNING The -E option is not allowed if a proteinortho file is given. -E is now unset.\n";$doregex=0;}
my $query_basename=$query;
@@ -328,7 +332,7 @@ for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
if( $test_match ){
if($qdata{$filename}{$key} eq ""){
- print STDERR "[STDERR] WARNIÂ NG The input ($key) was found multiple times in the fasta files ".(!$exact ? "(maybe try --exact)." : ".")."\n";
+ print STDERR "[STDERR] WARNING The input ($key) was found multiple times in the fasta files ".(!$exact ? "(maybe try --exact)." : ".")."\n";
}
my $headerstr=$curLine;
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/6db4d97b2bd51ad57b2810a7aa0eb7f89840016d
--
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/6db4d97b2bd51ad57b2810a7aa0eb7f89840016d
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210201/42aed87d/attachment-0001.html>
More information about the debian-med-commit
mailing list