[med-svn] [Git][med-team/placnet][upstream] New upstream version 1.04

Tue Sep 28 09:08:49 BST 2021


Andreas Tille pushed to branch upstream at Debian Med / placnet


Commits:
e8962f52 by Andreas Tille at 2021-09-28T09:35:36+02:00
New upstream version 1.04
- - - - -


3 changed files:

- + CHANGES.txt
- + makeRefDB.pl
- placnet.pl


Changes:

=====================================
CHANGES.txt
=====================================
@@ -0,0 +1,4 @@
+VERSION 1.04
+
+- Fixed placnet to work with new NCBI specifications (remove GI identifiers)
+- Added new script to download and format the Reference Database for Placnet.


=====================================
makeRefDB.pl
=====================================
@@ -0,0 +1,94 @@
+#!/usr/bin/perl
+
+
+########################################################################
+# Perl scritp for download the placnet RefDB database of genomes and   #
+# plasmids from NCBI databases. Script download all complete genomes   #
+# from RefSeq bacteria and all isolate Plamids (whitout associated     #
+# chromosome). Additionally script create a headersRefDB.txt file to   #
+# import description information in Placnet networks                   #
+#                                                                      #
+#                                                                      #
+# Just run: ./makeRefDB                                                #
+#                                                                      #
+# outputs: RefDB.XX.nXX (blast nucleotide database)                    #
+#          headersRefDB.txt (TAB file with genome description)         #
+########################################################################
+
+print("\n\nDownloading index of RefSeq Bacteria Database\n");
+system("wget -nv --show-progress ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt");
+
+open(SUM,"assembly_summary.txt");
+open(OUT,">down.list");
+@summary = <SUM>;
+
+@complete = grep(/Complete Genome/,@summary);
+
+foreach $l (@complete)
+{
+	chomp $l;
+	@c = split(/\t/,$l);
+	@c2 = split(/\//,$c[19]);
+	print OUT "$c[-1]/$c2[-1]_genomic.fna.gz\n";	
+}
+close SUM;
+close OUT;
+
+print ("\nDownloading complete genomes...\n");
+system("wget -nv --show-progress -i down.list");
+
+print ("\nDownloading complete plasmids...\n");
+system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plasmid/*genomic.fna*");
+
+print("Decompressing files...\n");
+system("gzip -d *.gz");
+system("cat plasmid*.fna > all_plasmid_tmp.fna");
+system("grep '>' GC*fna | cut -f2 --delimiter='>' | cut -f1 --delimiter=' ' > acc.txt");
+
+####removing duplicates between plasmids.*.fna and GCA_*.fna
+open(A,"acc.txt");
+@acc = <A>;
+close A;
+
+foreach $l (@acc)
+{
+	chomp $l;
+	$hash{$l} =1;
+}
+
+
+$prt =1;
+open(F,"all_plasmid_tmp.fna");
+open(O,">all_plasmid_nr.fna");
+while ($l = <F>)
+{
+	if($l =~ />/)
+	{
+		@c = split(/\|/,$l);
+		if (exists($hash{$c[3]}))
+		{
+			$prt = 0;
+		}else{
+			$prt = 1;
+		}
+	}
+	
+	if($prt ==1)
+	{
+			print O $l;
+	}
+	
+}
+
+
+system("cat GC* all_plasmid_nr.fna > all.fasta");
+
+print ("Making Blast Datadase...\n");
+system("sed -i 's/>/>refDB|/' all.fasta");
+system("makeblastdb -in all.fasta -out RefDB -dbtype nucl");
+system("grep '>' all.fasta | sed 's/ /\t/' | sed 's/>//' > headersRefDB.txt");
+
+#system("rm all_plasmid_tmp.fna acc.txt plasmid*.fna");
+
+
+print("\n\nFINISHED\n");


=====================================
placnet.pl
=====================================
@@ -1,5 +1,5 @@
 #!/usr/bin/perl  
-#v1.03
+#v1.04
 
 use strict;
 #use warnings;
@@ -115,18 +115,9 @@ if($contigsFile eq "")
 }
 else{ 
 	
-	system("gmhmmp_heuristic.pl -s $contigsFile -out tmpGM -a");
-	$fastaProt = gm2fasta("tmpGM.lst");
-	system("gmhmmp_heuristic.pl -s $contigsFile -out tmpGM -d");
-	$fastaNucl = gm2fasta("tmpGM.lst");
+	system("prodigal -q -a $prefix.prod.faa -d $prefix.prod.cds -i $contigsFile ");
+
 	
-	#### CDS and ORF prediction ######
-	open(FPROT,">$prefix.gm.faa");
-	print FPROT $fastaProt;
-	close FPROT;
-	open(FNUCL, ">$prefix.gm.cds");
-	print FNUCL $fastaNucl;
-	close FNUCL;
 }
 		
 #}
@@ -199,11 +190,11 @@ sub blastRefDB     #### blastRefDB(type)
 	
 	if($type eq "blast")
 	{
-		system("blastn -query $contigsFile -db $refDBFile -out tmpMegaBlast.txt -num_alignments 0 -evalue 1e-25");
+		system("blastn -query $contigsFile -db $refDBFile -out tmpMegaBlast.txt -num_alignments 0 -evalue 1e-25 -num_threads 24");
 	}elsif ($type eq "fasta")
 	{
 		system("makeblastdb -in $refDBFile -out tmpRefDB -dbtype nucl");
-		system("blastn -query $contigsFile -db tmpRefDB -out tmpMegaBlast.txt -num_alignments 0 -evalue 1e-25");
+		system("blastn -query $contigsFile -db tmpRefDB -out tmpMegaBlast.txt -num_alignments 0 -evalue 1e-25 -num_threads 24");
 	}else{
 		print "Error in Reference DB format\n";
 		exit 0;
@@ -222,7 +213,7 @@ sub blastRefDB     #### blastRefDB(type)
 			$n=1;
 			#print "$node\n";
 		}
-		if ($l =~ /gi\|/) 
+		if ($l =~ /refDB\|/) 
 		{
 			#print $l;
 			@c = split(' ',$l);
@@ -253,8 +244,8 @@ sub sam2scaffold    #### attr:	sam2scaffold(SamDefinition)
 	@c = split('\t',$s);
 	
 	my $samFile = $c[0];
-	my $readLength = $c[1];
-	my $insert = $c[2];
+	my $insert = $c[1];
+	my $readLength = $c[2];
 	
 	print "$c[0]\t$c[1]\t$c[2]\n";
 	
@@ -393,7 +384,7 @@ sub database   #### Attributes name,fastaFile,type,threshold
 	
 	if($dbType eq "prot")
 	{
-		system("blastp -query $prefix.gm.faa -db $db -outfmt 6 -evalue $evalue -out tmp$name.blast -num_alignments 1"); 
+		system("blastp -query $prefix.prod.faa -db $db -outfmt 6 -evalue $evalue -out tmp$name.blast -num_alignments 1"); 
 	}elsif ($dbType eq "nucl")
 	{
 		system("blastn -query $contigsFile -db $db -outfmt 6 -evalue $evalue -out tmp$name.blast -num_alignments 1");
@@ -410,8 +401,9 @@ sub database   #### Attributes name,fastaFile,type,threshold
 		foreach $line (@dbText)
 		{
 			@fields1 = split('\t',$line);
-			@fields2 = split('\|',$fields1[0]);
-			print OUT "$fields2[1]\t$fields1[1]\n";
+			@fields2 = split('_',$fields1[0]);
+			#@node_name = "$fields2[0]_$fields2[1]_$fields2[2]_$fields2[3]_$fields2[4]_$fields2[5]";
+			print OUT "$fields2[0]_$fields2[1]\t$fields1[1]\n";
 		}
 		close OUT;
 	}else{
@@ -424,45 +416,12 @@ sub database   #### Attributes name,fastaFile,type,threshold
 	}	
 }
 	
-sub gm2fasta ############## attr: (geneMarkOutput.lst) return: fasta
-{
-	
-	open(A,"tmpGM");
-	my @txt = <A>;
-	close A;
-
 
-	my $out ="";
-	my $cond=0;
-	foreach $line (@txt)
-	{
-		if($line =~ />/)
-		{
-			$line =~ s/\|GeneMark\.hmm\|\d+_(aa|nt)\|(\-|\+)\|\d+\|\d+\t>/\|/;
-		}
-		if($line =~ /#===/)
-		{
-			$cond=0;
-		}
-		if ($cond==1)
-		{
-			$out .= $line;
-		}
-		if($line =~ /Predicted proteins:/ | $line =~ /Nucleotide sequence of predicted genes:/)
-		{
-		   $cond=1;
-		}
-	}
-	
-	
-	return $out;
-	
-}
 
 sub usage
 {
 	print "Usage:\n\n";
-	print "Placnet v1.03 10/06/2015\n";
+	print "Placnet v1.04 10/15/20116\n";
 	print "writen by: Val F. Lanza (valfernandez.vf\@gmail.com) and Maria de Toro (mdtorohernando\@gmail.com\n\n";\
 	print "Please cite PLACNET as: \nLanza VF, de Toro M, Garcillán-Barcia MP, Mora A, Blanco J, Coque TM, de la Cruz F: \nPlasmid Flux in Escherichia coli ST131 Sublineages, Analyzed by Plasmid Constellation Network (PLACNET),\na New Method for Plasmid Reconstruction from Whole Genome Sequences. \nPLoS Genet 2014, 10:e1004766\n\n";
 	print "Write inputFile Template\n\nplacnet.pl -generate\n\n";
@@ -492,7 +451,7 @@ SAM:	file2.sam	readLength2	insertSize2
 REFDB:	refdb	type(fasta/blast)
 
 
-##### Optional Attributes
+##### Optional Attibutes
 
 DB1:	name	file.fasta	type(nucl/prot)	threshold(E value)	format (fasta/blast)
 DB2:	name	file.fasta	type(nucl/prot)	threshold(E value)	format (fasta/blast)



View it on GitLab: https://salsa.debian.org/med-team/placnet/-/commit/e8962f522d9f3ba310c6cd9ee2e8e2d498ef870b

-- 
View it on GitLab: https://salsa.debian.org/med-team/placnet/-/commit/e8962f522d9f3ba310c6cd9ee2e8e2d498ef870b
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210928/45acb90f/attachment-0001.htm>