[med-svn] [Git][med-team/proteinortho][upstream] New upstream version 6.0.25+dfsg
Nilesh Patra
gitlab at salsa.debian.org
Sun Dec 6 14:44:51 GMT 2020
Nilesh Patra pushed to branch upstream at Debian Med / proteinortho
Commits:
36d77e12 by Nilesh Patra at 2020-12-06T20:06:12+05:30
New upstream version 6.0.25+dfsg
- - - - -
4 changed files:
- CHANGELOG
- CHANGEUID
- proteinortho6.pl
- src/proteinortho_grab_proteins.pl
Changes:
=====================================
CHANGELOG
=====================================
@@ -273,3 +273,6 @@
refined the Makefile for proteinortho_clustering, now it additionally tests if the program is executable with the -test option
12. Oct (5122)
enhancement of the Makefile (more verbose, added standard compiler flags, cleanup)
+ 2. Dez (5195)
+ fixing proteinortho_grab_protein.pl (https://gitlab.com/paulklemm_PHD/proteinortho/-/issues/41)
+ and ids that are not found are printed to STDERR
=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-5122
+5195
=====================================
proteinortho6.pl
=====================================
@@ -464,7 +464,7 @@ use POSIX;
##########################################################################################
# Variables
##########################################################################################
-our $version = "6.0.24";
+our $version = "6.0.25";
our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1 -> don't / be verbose
our $debug = 0; # 0/1 -> don't / show debug data
=====================================
src/proteinortho_grab_proteins.pl
=====================================
@@ -28,8 +28,8 @@
# @author Paul Klemm
# @email klemmp at staff.uni-marburg.de
# @company Bioinformatics, University of Leipzig
-# @version 3
-# @date 30-04-2020
+# @version 4
+# @date 3-12-2020
#
##########################################################################################
@@ -144,36 +144,54 @@ if($fail){
my %qdata;
# my $qdata_count = {};
-my $orthogroupcounter=0;
-my $genecounter=0;
+our $orthogroupcounter=0;
+our $genecounter=0;
+my $numOfFastas=0;
+my $line_i = 0;
-unless(open(my $FH,'<',$query)) {
- if($query eq "-"){
- foreach my $line (<STDIN>)
- {
- $line=~s/[\r\n]+$//;
+for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
+ if($ARGV_copyiddone[$v]){next;}
+ $numOfFastas++;
+}
+our @filenames;
- if(length($line)==0 || $line eq ""){next;}
+my $foundHeader=0;
- my @sp = split(/\t/,$line);
- if(substr($line,0,1) eq "#"){@filenames=@sp; next;}
- if(scalar(@sp)>3){
- for(my $v = 3 ; $v < scalar @sp ; $v++){
- if($sp[$v] eq "*" || $sp[$v] eq ""){next;}
- my @spp = split(",",$sp[$v]);
+# print STDERR (scalar keys %qdata)." vs ".($numOfFastas);
- for(my $vv = 0 ; $vv < scalar @spp ; $vv++){
+sub processLine{
+ my $line = shift;
+ my $prefix = shift;
- if($spp[$vv] eq "*" || $spp[$vv] eq ""){next;}
- $spp[$vv]=~s/^\(//;$spp[$vv]=~s/\)$//;
+ $line=~s/[\r\n]+$//;
- $qdata{"STDIN"}{$spp[$vv]}="STDIN.OrthoGroup".$orthogroupcounter;
- $genecounter++;
- }
+ my @sp = split(/\t/,$line);
+ if(substr($line,0,1) eq "#"){$foundHeader=1;@filenames=@sp; next;}
+ if(scalar(@sp)>3){
+ for(my $v = 3 ; $v < scalar @sp ; $v++){
+ if($sp[$v] eq "*" || $sp[$v] eq ""){next;}
+ my @spp = split(",",$sp[$v]);
+
+ for(my $vv = 0 ; $vv < scalar @spp ; $vv++){
+
+ if($spp[$vv] eq "*" || $spp[$vv] eq ""){next;}
+ $spp[$vv]=~s/^\(//;$spp[$vv]=~s/\)$//;
+
+ if(!exists $filenames[$v]){
+ $filenames[$v]=""
}
+
+ $qdata{$filenames[$v]}{$spp[$vv]}=$prefix.".OrthoGroup".$orthogroupcounter;
+ $genecounter++;
}
- $orthogroupcounter++;
}
+ }
+ $orthogroupcounter++;
+}
+
+unless(open(my $FH,'<',$query)) {
+ if($query eq "-"){
+ foreach my $line (<STDIN>) { &processLine($line, "STDIN") }
}else{
my @sp = split($del,$query);
for(my $v = 0 ; $v < scalar @sp ; $v++){
@@ -193,32 +211,17 @@ unless(open(my $FH,'<',$query)) {
$query_basename=$1;
}
- @filenames;
- while(<$FH>){
- $_=~s/[\r\n]+$//;
- my @sp = split(/\t/,$_);
- if(substr($_,0,1) eq "#"){@filenames=@sp; next;}
- if(scalar(@sp)>3){
- for(my $v = 3 ; $v < scalar @sp ; $v++){
- if($sp[$v] eq "*" || $sp[$v] eq ""){next;}
- my @spp = split(",",$sp[$v]);
-
- for(my $vv = 0 ; $vv < scalar @spp ; $vv++){
-
- if($spp[$vv] eq "*" || $spp[$vv] eq ""){next;}
- $spp[$vv]=~s/^\(//;$spp[$vv]=~s/\)$//;
-
- $qdata{$filenames[$v]}{$spp[$vv]}=$query_basename.".OrthoGroup".$orthogroupcounter;
- $genecounter++;
- }
- }
- }
- $orthogroupcounter++;
- }
+ while(<$FH>){ &processLine($_, $query_basename) }
close($FH);
print STDERR "[STDERR] Done reading the query $query file. Now I know $orthogroupcounter groups with $genecounter genes/proteins in total.\n";
}
+
+if( $foundHeader==0 && $numOfFastas > 3 && $genecounter > 20){
+ print STDERR "\nWARNING : The header of the proteinortho file is missing, this can increase the runtime dramatically. Please include the first line (starting with '#'), to accelerate this program.\n$NC\n";
+ sleep 1;
+}
+
if( $tofiles==1 && ($orthogroupcounter > 100) ){
print STDERR "\n!!!\nWARNING : This call will produce $orthogroupcounter files (one for each orthology group) !\nIn the *.html file you can individually extract single groups by clicking on the front part of a row.\n$NC";
print STDERR "Press 'strg+c' to prevent me from proceeding or wait 20 seconds to continue...\n!!!\n";
@@ -232,20 +235,16 @@ my $cur_gene_filename="";
my %cur_gene_firsttime;
my $genecounterfound=0;
my $basename = "";
-my $numOfFastas=0;
-
-for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
- if($ARGV_copyiddone[$v]){next;}
- $numOfFastas++;
-}
my $fastai=1;
+my %foundIDs;
+
for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
if($ARGV_copyiddone[$v]){next;}
- if($tofiles){print STDERR "[STDERR] ($fastai/$numOfFastas) : ";if($basename ne ""){print STDERR "Done reading $basename. "}print STDERR "Start reading the fasta file ".($ARGV_copy[$v])."\n";}
+ print STDERR "[STDERR] ($fastai/$numOfFastas) : ";if($basename ne ""){print STDERR "Done reading $basename. "}print STDERR "Start reading the fasta file ".($ARGV_copy[$v])."\n";
$fastai++;
$basename = $ARGV_copy[$v];
@@ -297,6 +296,8 @@ for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
$cur_gene_filename=$qdata{$basename}{$genename}.".fasta";
$geneprintswitch = 1;
$genecounterfound++;
+
+ delete $qdata{$basename}{$genename};
}
}else{ # fallback, if the basename (filename) does not exists, try all
@@ -305,20 +306,41 @@ for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
if($geneprintswitch){last;}
foreach my $key (keys %{$qdata{$filename}}) {
my $regexv=$key;
+ my $curLine_test = $curLine;
- if(!$doregex){$regexv=quotemeta($regexv);}
- if($exact){$regexv='\b'.$regexv.'\b';}
+ if(!$doregex && !$exact){$regexv=quotemeta($regexv);}
+
+ my $test_match = 0;
+ if( !$exact ){
+
+ # use regular expression
+ $test_match = $curLine_test =~ $regexv;
+
+ }else{
+
+ # directly compare starting with first 5 character of fasta entry as offset
+ my $offset = 1; # start at 1 -> fasta entries starts with ">"
+ while( !( $test_match = substr($curLine,$offset,length $key) eq $key ) && $offset < 5 ){
+ $offset++;
+ }
+ }
+
+ if( $test_match ){
+
+ if($qdata{$filename}{$key} eq ""){
+ print STDERR "[STDERR] WARNI NG The input ($key) was found multiple times in the fasta files ".(!$exact ? "(maybe try --exact)." : ".")."\n";
+ }
- if( $curLine =~ $regexv ){
-
my $headerstr=$curLine;
if($source){$headerstr=$headerstr." ".$basename;}
$cur_gene.=$headerstr."\n";
- $cur_gene_filename=$qdata{$filename}{$key}.".fasta";
+ $cur_gene_filename = $qdata{$filename}{$key}.".fasta";
$geneprintswitch = 1;
$genecounterfound++;
+ delete $qdata{$filename}{$key};
+
last;
}
}
@@ -352,6 +374,20 @@ if($genecounter != $genecounterfound){
print STDERR "[STDERR] WARNING The input ($query) contains $genecounter queries, but I extracted $genecounterfound entries out of the fasta(s).";
if(!$exact){print STDERR " If this is not desired, please consider using the -exact option";}elsif($genecounter > $genecounterfound){print STDERR "\n-> This should not have happen, maybe some fasta files are missing as input?\n(If you cannot solve this error, please send a report to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com or visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes for more help. Further more all mails to lechner\@staff.uni-marburg.de are welcome)\n";}
print STDERR "\n";
+
+ if($genecounterfound < $genecounter){
+ print STDERR "The following ids were not found:\n";
+ my $counter=0;
+ foreach my $filename (keys %qdata) {
+ foreach my $key (keys %{$qdata{$filename}}) {
+ if(10 < $counter++){last}
+ print STDERR $key."\n";
+ }
+ if(10 < $counter){print STDERR " ...\n";last}
+ }
+ print STDERR "\nPlease make sure that the upper ids are part of the given fasta files (try searching these in the given fasta files) !\n";
+ }
+
}else{
print STDERR "[STDERR] All entries of the query are found in the fasta(s).\n";
-}
\ No newline at end of file
+}
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/36d77e12861ac04e070fbcb0894b9cde8c5c14ad
--
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/36d77e12861ac04e070fbcb0894b9cde8c5c14ad
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201206/b005c49a/attachment-0001.html>
More information about the debian-med-commit mailing list