[med-svn] [placnet] 01/03: Imported Upstream version 1.03

Andreas Tille tille at debian.org
Mon Oct 12 13:12:43 UTC 2015


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository placnet.

commit 676867a30716fc4fedf011245f69e18571cf41e9
Author: Andreas Tille <tille at debian.org>
Date:   Mon Oct 12 14:50:22 2015 +0200

    Imported Upstream version 1.03
---
 placnet.pl | 503 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 503 insertions(+)

diff --git a/placnet.pl b/placnet.pl
new file mode 100644
index 0000000..b24b307
--- /dev/null
+++ b/placnet.pl
@@ -0,0 +1,503 @@
+#!/usr/bin/perl  
+#v1.03
+
+use strict;
+#use warnings;
+use Getopt::Long;
+
+
+### Variables
+
+my @Options;
+my $inputFile;
+my $prefix;
+my @inputText;
+my $contigsFile;
+my $refDBFile;
+my @sams;
+my @DBs;
+my $MINLENGTH;
+my $blastNet;
+my $scaffNet ="";
+my $fastaProt;
+my $fastaNucl;
+my $dbRefType;
+my $db;
+
+### temp variables
+my $line;
+my @fields;
+my @c;
+my $l;
+my $s;
+
+ at Options = (
+	{OPT=>"f=s",	VAR=>\$inputFile,	DESC=>"Input file for PLACNET analysis"},
+	{OPT=>"p=s",	VAR=>\$prefix,	DEFAULT => 'placnet', DESC=>"Prefix name for output files"},
+	{OPT=>"l=s",	VAR=>\$MINLENGTH,	DEFAULT => '200', DESC=>"Minimun contig length to scaffold process"}
+	
+);
+
+#Check options and set variables
+
+
+
+(@ARGV < 1) && (usage());
+if ($ARGV[0] eq "-generate")
+{
+	generateInputFile();
+	exit 0;
+}
+
+
+GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage();
+
+# Now setup default values.
+foreach (@Options) {
+	if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
+	${$_->{VAR}} = $_->{DEFAULT};
+	}
+}
+
+
+
+unless($inputFile){
+	print STDERR "You must specified the input files\n";
+	&usage();
+}
+
+open(INPUT,$inputFile);
+ at inputText = <INPUT>;
+close INPUT;
+
+print @inputText;
+
+
+
+####   PARSING INPUT FILE   #############
+
+foreach $line (@inputText)
+{
+	if($line =~ /^CONTIGS/)
+	{
+		chomp $line;
+		@fields = split('\t',$line);
+		$contigsFile = $fields[1];
+	}
+	if($line =~ /^REFDB/)
+	{
+		chomp $line;
+		@fields = split('\t',$line);
+		$refDBFile = $fields[1];
+		$dbRefType = $fields[2];
+	}
+	if($line =~ /^SAM/)
+	{
+		chomp $line;
+		$line =~ s/SAM:\t//g;
+		push(@sams,$line);
+	}
+	if($line =~ /^DB/)
+	{
+		chomp $line;
+		$line =~ s/DB\d*:\t//g;
+		push(@DBs,$line);
+	}
+	
+}
+
+###   CHECKING INPUT FILE   ##########
+
+if($contigsFile eq "")
+{
+	print "\nError in $inputFile file, Please specify a contig file\n\n";
+	exit;
+}
+else{ 
+	
+	system("gmhmmp_heuristic.pl -s $contigsFile -out tmpGM -a");
+	$fastaProt = gm2fasta("tmpGM.lst");
+	system("gmhmmp_heuristic.pl -s $contigsFile -out tmpGM -d");
+	$fastaNucl = gm2fasta("tmpGM.lst");
+	
+	#### CDS and ORF prediction ######
+	open(FPROT,">$prefix.gm.faa");
+	print FPROT $fastaProt;
+	close FPROT;
+	open(FNUCL, ">$prefix.gm.cds");
+	print FNUCL $fastaNucl;
+	close FNUCL;
+}
+		
+#}
+if($refDBFile eq "")
+{
+	print "\nError in $inputFile file, Please specify a reference database file\n\n";
+	exit 0;
+}
+if(scalar(@sams)<1)
+{
+	print "\nError in $inputFile file, Please specify a SAM file file\n\n";
+	exit 0;
+}
+
+############## MAIN ROUTINE####################
+
+
+
+print "\n\nMaking blast against $refDBFile .........\n\n";
+$blastNet = blastRefDB($dbRefType);
+
+
+
+foreach $s (@sams)
+{
+	chomp $s;
+	@c = split('\t',$s);
+	print "\nMaking scaffold network from $c[0]\n\n";
+	
+	$scaffNet .= sam2scaffold($s);
+}
+
+
+open(OUT,">$prefix.net.csv");
+print OUT $blastNet;
+print OUT $scaffNet;
+close OUT;
+
+foreach $db (@DBs)
+{
+		@fields = split('\t',$db);	
+		database($fields[0],$fields[1],$fields[2],$fields[3],$fields[4]);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+################################################
+
+
+#### BLASTING REFDB   #########
+
+sub blastRefDB     #### blastRefDB(type)
+{
+	my $node;
+	my $value;
+	my $n;
+	my $l;
+	my $out;
+	my $type = $_[0];
+	
+	if($type eq "blast")
+	{
+		system("blastn -query $contigsFile -db $refDBFile -out tmpMegaBlast.txt -num_alignments 0 -evalue 1e-25");
+	}elsif ($type eq "fasta")
+	{
+		system("makeblastdb -in $refDBFile -out tmpRefDB -dbtype nucl");
+		system("blastn -query $contigsFile -db tmpRefDB -out tmpMegaBlast.txt -num_alignments 0 -evalue 1e-25");
+	}else{
+		print "Error in Reference DB format\n";
+		exit 0;
+	}
+	open(A,"tmpMegaBlast.txt");
+	my @mega = <A>;
+	close A;
+	
+	foreach $l (@mega)
+	{
+		$l =~ s/;//g;
+		if ($l =~ /Query= (.*)\n/)
+		{
+			$node = $1;
+			$value =0;
+			$n=1;
+			#print "$node\n";
+		}
+		if ($l =~ /gi\|/) 
+		{
+			#print $l;
+			@c = split(' ',$l);
+	
+			if ($value == 0)
+				{
+				$value = $c[-2];
+			}
+			if($c[-2]/$value > 0.85+(0.02*$n))
+			{
+				$n++;
+				$out .= "$node\thit\t$c[0]\t$c[-2]\t$c[-1]\n";
+				
+				$value = ($value + $c[-2])/2;
+			}
+		}
+	}
+	return $out;
+}
+
+
+##### SAM2SCAFFOLD subrutine ###############
+sub sam2scaffold    #### attr:	sam2scaffold(SamDefinition)
+{
+	### attributes
+	$s = $_[0];
+	chomp $s;
+	@c = split('\t',$s);
+	
+	my $samFile = $c[0];
+	my $readLength = $c[1];
+	my $insert = $c[2];
+	
+	print "$c[0]\t$c[1]\t$c[2]\n";
+	
+	my $FLANK = $insert + 2*$readLength;
+	
+	
+	
+	### Variables
+	my @txt;
+	my $l;
+	my $k;
+	my $j;
+	my $i;
+	my %contigs;
+	my %length;
+	my %scaffolds;
+	my $sum;
+	my $avg;
+	my $out = "";
+	my $header;
+	my @ks;
+	
+	open(CONTIGS,$contigsFile);
+	@txt = <CONTIGS>;
+	close CONTIGS;
+
+	foreach $l (@txt)
+	{
+		chomp $l;
+		if($l =~ /^>(.*)/)
+		{
+			$header = $1;
+			$header =~ s/\s//g;
+			
+		}else{
+			$contigs{$header} .= $l;
+		}
+	}
+	foreach $k (keys(%contigs))
+	{
+		$contigs{$k} =~ s/\s//g;
+		$length{$k} = length($contigs{$k});
+				
+	}
+	
+	
+	open(SAM,$samFile);
+	
+	
+### SAM Format specifications	
+	#$c[0]	1	QNAME	Query template/pair NAME
+	#$c[1]	2	FLAG	bitwise FLAG
+	#$c[2]	3	RNAME	Reference sequence NAME
+	#$c[3]	4	POS	1-based leftmost POSition/coordinate of clipped sequence
+	#$c[4]	5	MAPQ	MAPping Quality (Phred-scaled)
+	#$c[5]	6	CIAGR	extended CIGAR string
+	#$c[6]	7	MRNM	Mate Reference sequence NaMe (‘=’ if same as RNAME)
+	#$c[7]	8	MPOS	1-based Mate POSistion
+	#$c[8]	9	TLEN	inferred Template LENgth (insert size)
+	#$c[9]	10	SEQ	query SEQuence on the same strand as the reference
+	#$c[10]	11	QUAL	query QUALity (ASCII-33 gives the Phred base quality)
+	#$c[11]	12+	OPT	variable OPTional fields in the format TAG:VTYPE:VALUE
+####
+	
+	while ($l=<SAM>)
+	{
+		@c = split('\t',$l);
+		if ($c[6] !~ /\*|=/ & $length{$c[6]} > $MINLENGTH & $length{$c[2]} > $MINLENGTH ) ### Only contigs longer than MINLENGTH
+		{
+			
+			if($c[3] <= $FLANK | abs($length{$c[2]}-$c[3]) <= $FLANK && ($c[7] <= $FLANK | abs($length{$c[6]}-$c[7]) <= $FLANK))
+			{
+				
+				$scaffolds{$c[2]}{$c[6]}++;
+				$scaffolds{$c[6]}{$c[2]}++;
+				
+			}
+		}
+		
+	}
+	close SAM;
+	
+	$sum =0;
+	foreach $k (keys(%length))
+	{
+		foreach $j (keys(%length))
+		{
+			$sum += $scaffolds{$k}{$j};
+		}
+	}
+	
+	$avg = ($sum/2)/(1+scalar(keys(%scaffolds)));  ##### $sum/2 for simetric matrix
+	
+	
+	@ks = keys(%length);
+	for($i=0; $i<scalar(@ks); $i++)
+	{
+		for($j=$i+1; $j<scalar(@ks);$j++)
+		{
+				if($scaffolds{$ks[$i]}{$ks[$j]} > $avg*0.30)   #### Only reports scaffold with abundance over 30% of the mean scaffold abundance.
+				{
+					$out .= "$ks[$i]\tscaff\t$ks[$j]\t\t\t$scaffolds{$ks[$i]}{$ks[$j]}\n";
+				}
+		}
+	}
+	return $out;
+	
+}
+
+sub database   #### Attributes name,fastaFile,type,threshold
+{
+	my $name = $_[0];
+	my $dbFastaFile = $_[1];
+	my $dbType = $_[2];
+	my $evalue = $_[3];
+	my $formatDB =$_[4];
+	my @dbText;
+	my $line;
+	my @fields1;
+	my @fields2;
+	my $db = "tmpTypeDB";
+	
+	
+	if($formatDB eq "fasta")
+	{
+		
+		system("makeblastdb -in $dbFastaFile -out $db -dbtype $dbType");
+		
+	}elsif ($formatDB eq "blast")
+	{
+		$db = $dbFastaFile;
+	}else{
+		print "Error: unvalid format of $name database.\n";
+		exit 0;
+	}
+	
+	if($dbType eq "prot")
+	{
+		system("blastp -query $prefix.gm.faa -db $db -outfmt 6 -evalue $evalue -out tmp$name.blast -num_alignments 1"); 
+	}elsif ($dbType eq "nucl")
+	{
+		system("blastn -query $contigsFile -db $db -outfmt 6 -evalue $evalue -out tmp$name.blast -num_alignments 1");
+	}
+	
+	open(DB,"tmp$name.blast");
+	@dbText = <DB>;
+	close DB;
+
+	open(OUT,">$name.annot");
+	
+	if($dbType eq "prot")
+	{
+		foreach $line (@dbText)
+		{
+			@fields1 = split('\t',$line);
+			@fields2 = split('\|',$fields1[0]);
+			print OUT "$fields2[1]\t$fields1[1]\n";
+		}
+		close OUT;
+	}else{
+		foreach $line (@dbText)
+		{
+			@fields1 = split('\t',$line);
+			print OUT "$fields1[0]\t$fields1[1]\n";
+		}
+		close OUT;	
+	}	
+}
+	
+sub gm2fasta ############## attr: (geneMarkOutput.lst) return: fasta
+{
+	
+	open(A,"tmpGM");
+	my @txt = <A>;
+	close A;
+
+
+	my $out ="";
+	my $cond=0;
+	foreach $line (@txt)
+	{
+		if($line =~ />/)
+		{
+			$line =~ s/\|GeneMark\.hmm\|\d+_(aa|nt)\|(\-|\+)\|\d+\|\d+\t>/\|/;
+		}
+		if($line =~ /#===/)
+		{
+			$cond=0;
+		}
+		if ($cond==1)
+		{
+			$out .= $line;
+		}
+		if($line =~ /Predicted proteins:/ | $line =~ /Nucleotide sequence of predicted genes:/)
+		{
+		   $cond=1;
+		}
+	}
+	
+	
+	return $out;
+	
+}
+
+sub usage
+{
+	print "Usage:\n\n";
+	print "Placnet v1.03 10/06/2015\n";
+	print "writen by: Val F. Lanza (valfernandez.vf\@gmail.com) and Maria de Toro (mdtorohernando\@gmail.com\n\n";\
+	print "Please cite PLACNET as: \nLanza VF, de Toro M, Garcillán-Barcia MP, Mora A, Blanco J, Coque TM, de la Cruz F: \nPlasmid Flux in Escherichia coli ST131 Sublineages, Analyzed by Plasmid Constellation Network (PLACNET),\na New Method for Plasmid Reconstruction from Whole Genome Sequences. \nPLoS Genet 2014, 10:e1004766\n\n";
+	print "Write inputFile Template\n\nplacnet.pl -generate\n\n";
+	print "Network process\n\nplacnet.pl -f inputFile.txt -p prefix -l min\n\n\n";
+
+	foreach (@Options) {
+		
+		printf "  -%-13s %s%s.\n",$_->{OPT},$_->{DESC},
+			defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
+	}
+	print "\n\n\n";
+	exit(1);
+}
+
+sub generateInputFile
+{
+	open(OUT,">inputFile.txt");
+	my $text = "### Placnet Input File ###
+
+
+
+CONTIGS:	multifasta.fasta
+SAM:	file1.sam	readLength1	insertSize1
+SAM:	file2.sam	readLength2	insertSize2
+#...
+
+REFDB:	refdb	type(fasta/blast)
+
+
+##### Optional Attributes
+
+DB1:	name	file.fasta	type(nucl/prot)	threshold(E value)	format (fasta/blast)
+DB2:	name	file.fasta	type(nucl/prot)	threshold(E value)	format (fasta/blast)
+DB3:	name	file.fasta	type(nucl/prot)	threshold(E value)	format (fasta/blast)
+#....";
+	print OUT $text;
+	close OUT;
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/placnet.git



More information about the debian-med-commit mailing list