[med-svn] [Git][med-team/proteinortho][upstream] New upstream version 6.0.25+dfsg

Sun Dec 6 14:44:51 GMT 2020


Nilesh Patra pushed to branch upstream at Debian Med / proteinortho


Commits:
36d77e12 by Nilesh Patra at 2020-12-06T20:06:12+05:30
New upstream version 6.0.25+dfsg
- - - - -


4 changed files:

- CHANGELOG
- CHANGEUID
- proteinortho6.pl
- src/proteinortho_grab_proteins.pl


Changes:

=====================================
CHANGELOG
=====================================
@@ -273,3 +273,6 @@
 		refined the Makefile for proteinortho_clustering, now it additionally tests if the program is executable with the -test option		
 	12. Oct (5122)
 		enhancement of the Makefile (more verbose, added standard compiler flags, cleanup)
+	2. Dec (5195)
+		fixing proteinortho_grab_proteins.pl (https://gitlab.com/paulklemm_PHD/proteinortho/-/issues/41)
+		and ids that are not found are printed to STDERR 


=====================================
CHANGEUID
=====================================
@@ -1 +1 @@
-5122
+5195


=====================================
proteinortho6.pl
=====================================
@@ -464,7 +464,7 @@ use POSIX;
 ##########################################################################################
 # Variables
 ##########################################################################################
-our $version = "6.0.24";
+our $version = "6.0.25";
 our $step = 0;    # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
 our $verbose = 1; # 0/1   -> don't / be verbose
 our $debug = 0;   # 0/1   -> don't / show debug data


=====================================
src/proteinortho_grab_proteins.pl
=====================================
@@ -28,8 +28,8 @@
 # @author Paul Klemm
 # @email klemmp at staff.uni-marburg.de
 # @company Bioinformatics, University of Leipzig
-# @version 3
-# @date 30-04-2020
+# @version 4
+# @date 3-12-2020
 #
 ##########################################################################################
 
@@ -144,36 +144,54 @@ if($fail){
 my %qdata;
 # my $qdata_count = {};
 
-my $orthogroupcounter=0;
-my $genecounter=0;
+our $orthogroupcounter=0;
+our $genecounter=0;
+my $numOfFastas=0;
+my $line_i = 0;
 
-unless(open(my $FH,'<',$query)) {
-	if($query eq "-"){
-		foreach my $line (<STDIN>) 
-		{
-			$line=~s/[\r\n]+$//;
+for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
+	if($ARGV_copyiddone[$v]){next;}
+	$numOfFastas++;
+}
+our @filenames;
 
-			if(length($line)==0 || $line eq ""){next;}
+my $foundHeader=0;
 
-			my @sp = split(/\t/,$line);
-			if(substr($line,0,1) eq "#"){@filenames=@sp; next;}
-			if(scalar(@sp)>3){
-				for(my $v = 3 ; $v < scalar @sp ; $v++){
-					if($sp[$v] eq "*" || $sp[$v] eq ""){next;}
-					my @spp = split(",",$sp[$v]);
+# print STDERR (scalar keys %qdata)." vs ".($numOfFastas);
 
-					for(my $vv = 0 ; $vv < scalar @spp ; $vv++){
+sub processLine{
+	my $line = shift;
+	my $prefix = shift;
 
-						if($spp[$vv] eq "*" || $spp[$vv] eq ""){next;}
-						$spp[$vv]=~s/^\(//;$spp[$vv]=~s/\)$//;
+	$line=~s/[\r\n]+$//; 
 
-						$qdata{"STDIN"}{$spp[$vv]}="STDIN.OrthoGroup".$orthogroupcounter;
-						$genecounter++;
-					}
+	my @sp = split(/\t/,$line);
+	if(substr($line,0,1) eq "#"){$foundHeader=1;@filenames=@sp; next;}
+	if(scalar(@sp)>3){
+		for(my $v = 3 ; $v < scalar @sp ; $v++){
+			if($sp[$v] eq "*" || $sp[$v] eq ""){next;}
+			my @spp = split(",",$sp[$v]);
+
+			for(my $vv = 0 ; $vv < scalar @spp ; $vv++){
+
+				if($spp[$vv] eq "*" || $spp[$vv] eq ""){next;}
+				$spp[$vv]=~s/^\(//;$spp[$vv]=~s/\)$//;
+
+				if(!exists $filenames[$v]){
+					$filenames[$v]=""
 				}
+
+				$qdata{$filenames[$v]}{$spp[$vv]}=$prefix.".OrthoGroup".$orthogroupcounter;
+				$genecounter++;
 			}
-			$orthogroupcounter++;
 		}
+	}
+	$orthogroupcounter++;
+}
+
+unless(open(my $FH,'<',$query)) {
+	if($query eq "-"){
+		foreach my $line (<STDIN>) { &processLine($line, "STDIN") }
 	}else{
 		my @sp = split($del,$query); 
 		for(my $v = 0 ; $v < scalar @sp ; $v++){
@@ -193,32 +211,17 @@ unless(open(my $FH,'<',$query)) {
 		$query_basename=$1;
 	}
 
-	@filenames;
-	while(<$FH>){
-		$_=~s/[\r\n]+$//;
-		my @sp = split(/\t/,$_);
-		if(substr($_,0,1) eq "#"){@filenames=@sp; next;}
-		if(scalar(@sp)>3){
-			for(my $v = 3 ; $v < scalar @sp ; $v++){
-				if($sp[$v] eq "*" || $sp[$v] eq ""){next;}
-				my @spp = split(",",$sp[$v]);
-
-				for(my $vv = 0 ; $vv < scalar @spp ; $vv++){
-
-					if($spp[$vv] eq "*" || $spp[$vv] eq ""){next;}
-					$spp[$vv]=~s/^\(//;$spp[$vv]=~s/\)$//;
-
-					$qdata{$filenames[$v]}{$spp[$vv]}=$query_basename.".OrthoGroup".$orthogroupcounter;
-					$genecounter++;
-				}
-			}
-		}
-		$orthogroupcounter++;
-	}
+	while(<$FH>){ &processLine($_, $query_basename) }
 	close($FH);
 	print STDERR "[STDERR] Done reading the query $query file. Now I know $orthogroupcounter groups with $genecounter genes/proteins in total.\n";
 }
 
+
+if( $foundHeader==0 && $numOfFastas > 3 && $genecounter > 20){
+	print STDERR "\nWARNING : The header of the proteinortho file is missing, this can increase the runtime dramatically. Please include the first line (starting with '#'), to accelerate this program.\n$NC\n";
+	sleep 1;
+}
+
 if( $tofiles==1 && ($orthogroupcounter > 100) ){
 	print STDERR "\n!!!\nWARNING : This call will produce $orthogroupcounter files (one for each orthology group) !\nIn the *.html file you can individually extract single groups by clicking on the front part of a row.\n$NC";
 	print STDERR "Press 'strg+c' to prevent me from proceeding or wait 20 seconds to continue...\n!!!\n";
@@ -232,20 +235,16 @@ my $cur_gene_filename="";
 my %cur_gene_firsttime;
 my $genecounterfound=0;
 my $basename = "";
-my $numOfFastas=0;
-
-for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
-	if($ARGV_copyiddone[$v]){next;}
-	$numOfFastas++;
-}
 
 my $fastai=1;
 
+my %foundIDs;
+
 for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
 
 	if($ARGV_copyiddone[$v]){next;}
 
-	if($tofiles){print STDERR "[STDERR] ($fastai/$numOfFastas) : ";if($basename ne ""){print STDERR "Done reading $basename. "}print STDERR "Start reading the fasta file ".($ARGV_copy[$v])."\n";}
+	print STDERR "[STDERR] ($fastai/$numOfFastas) : ";if($basename ne ""){print STDERR "Done reading $basename. "}print STDERR "Start reading the fasta file ".($ARGV_copy[$v])."\n";
 	$fastai++;
 
 	$basename = $ARGV_copy[$v];
@@ -297,6 +296,8 @@ for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
 					$cur_gene_filename=$qdata{$basename}{$genename}.".fasta";
 					$geneprintswitch = 1;
 					$genecounterfound++;
+
+					delete $qdata{$basename}{$genename};
 				}
 
 			}else{ # fallback, if the basename (filename) does not exists, try all 
@@ -305,20 +306,41 @@ for(my $v = 0 ; $v < scalar @ARGV_copy ; $v++){
 					if($geneprintswitch){last;}
 					foreach my $key (keys %{$qdata{$filename}}) { 
 						my $regexv=$key; 
+						my $curLine_test = $curLine;
 						
-						if(!$doregex){$regexv=quotemeta($regexv);}
-						if($exact){$regexv='\b'.$regexv.'\b';}
+						if(!$doregex && !$exact){$regexv=quotemeta($regexv);}
+
+						my $test_match = 0;
+						if( !$exact ){
+							
+							# use regular expression
+							$test_match = $curLine_test =~ $regexv;
+
+						}else{
+							
+							# directly compare starting with first 5 character of fasta entry as offset
+							my $offset = 1; # start at 1 -> fasta entries starts with ">"
+							while( !( $test_match = substr($curLine,$offset,length $key) eq $key ) && $offset < 5 ){
+								$offset++;
+							} 
+						}
+
+						if( $test_match ){
+
+							if($qdata{$filename}{$key} eq ""){
+								print STDERR "[STDERR] WARNI NG The input ($key) was found multiple times in the fasta files ".(!$exact ? "(maybe try --exact)." : ".")."\n";
+							}
 
-						if( $curLine =~ $regexv ){
-									
 							my $headerstr=$curLine;
 							if($source){$headerstr=$headerstr." ".$basename;}
 
 							$cur_gene.=$headerstr."\n";	
-							$cur_gene_filename=$qdata{$filename}{$key}.".fasta";
+							$cur_gene_filename = $qdata{$filename}{$key}.".fasta";
 							$geneprintswitch = 1;
 							$genecounterfound++;
 
+							delete $qdata{$filename}{$key};
+
 							last;
 						}
 					}
@@ -352,6 +374,20 @@ if($genecounter != $genecounterfound){
 	print STDERR "[STDERR] WARNING The input ($query) contains $genecounter queries, but I extracted $genecounterfound entries out of the fasta(s).";
 	if(!$exact){print STDERR " If this is not desired, please consider using the -exact option";}elsif($genecounter > $genecounterfound){print STDERR "\n-> This should not have happen, maybe some fasta files are missing as input?\n(If you cannot solve this error, please send a report to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com or visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes for more help. Further more all mails to lechner\@staff.uni-marburg.de are welcome)\n";}
 	print STDERR "\n";
+
+	if($genecounterfound < $genecounter){
+		print STDERR "The following ids were not found:\n";
+		my $counter=0;
+		foreach my $filename (keys %qdata) { 
+			foreach my $key (keys %{$qdata{$filename}}) { 
+				if(10 < $counter++){last}
+				print STDERR $key."\n"; 
+			}
+			if(10 < $counter){print STDERR " ...\n";last}
+		}
+		print STDERR "\nPlease make sure that the upper ids are part of the given fasta files (try searching these in the given fasta files) !\n";
+	}
+	
 }else{
 	print STDERR "[STDERR] All entries of the query are found in the fasta(s).\n";
-}
\ No newline at end of file
+}



View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/36d77e12861ac04e070fbcb0894b9cde8c5c14ad

-- 
View it on GitLab: https://salsa.debian.org/med-team/proteinortho/-/commit/36d77e12861ac04e070fbcb0894b9cde8c5c14ad
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201206/b005c49a/attachment-0001.html>