[med-svn] [transdecoder] 01/01: Imported Upstream version 2.1.0+dfsg
    Michael Crusoe 
    misterc-guest at moszumanska.debian.org
       
    Wed May 18 09:29:36 UTC 2016
    
    
  
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to annotated tag upstream/2.1.0+dfsg
in repository transdecoder.
commit 9d2e299a03d6df33474253dee0f6da7cb2842385
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date:   Tue May 17 02:56:19 2016 -0700
    Imported Upstream version 2.1.0+dfsg
---
 Release.Notes => Changelog.txt                     |   8 ++
 PerlLib/Fasta_retriever.pm                         |  93 ++++++++++++++
 PerlLib/Longest_orf.pm                             |  69 ++++++++++-
 TransDecoder.LongOrfs                              |  14 +++
 TransDecoder.Predict                               |   9 +-
 sample_data/README.md                              |  12 ++
 .../blastp.results.outfmt6.gz                      | Bin
 sample_data/{ => cufflinks_example}/cleanme.pl     |   0
 .../{ => cufflinks_example}/pfam.domtblout.gz      | Bin
 sample_data/{ => cufflinks_example}/runMe.sh       |  17 ++-
 .../{ => cufflinks_example}/test.genome.fasta.gz   | Bin
 .../{ => cufflinks_example}/test.tophat.sam.gz     | Bin
 .../{ => cufflinks_example}/transcripts.gtf.gz     | Bin
 sample_data/{ => pasa_example}/cleanme.pl          |  18 ++-
 sample_data/pasa_example/genome.fasta.gz           | Bin 0 -> 852337 bytes
 sample_data/pasa_example/pasa_assemblies.fasta.gz  | Bin 0 -> 304896 bytes
 sample_data/pasa_example/pasa_assemblies.gff3.gz   | Bin 0 -> 55455 bytes
 sample_data/pasa_example/runMe.sh                  |  21 ++++
 .../simple_transcriptome_target/Trinity.fasta.gz   | Bin 0 -> 359112 bytes
 .../{ => simple_transcriptome_target}/cleanme.pl   |  16 +--
 sample_data/simple_transcriptome_target/runMe.sh   |  11 ++
 util/cdna_alignment_orf_to_genome_orf.pl           |  43 +++++--
 util/create_start_PSSM.pl                          | 131 ++++++++++++++++++++
 util/cufflinks_gtf_to_alignment_gff3.pl            |  10 +-
 util/cufflinks_gtf_to_bed.pl                       |   4 +-
 util/extract_FL_subset.pl                          | 134 +++++++++++++++++++++
 util/get_FL_accs.pl                                |  31 +++++
 util/get_longest_ORF_per_transcript.pl             |  63 ++++++++++
 28 files changed, 652 insertions(+), 52 deletions(-)
diff --git a/Release.Notes b/Changelog.txt
similarity index 77%
rename from Release.Notes
rename to Changelog.txt
index 025a2ba..3be9d58 100644
--- a/Release.Notes
+++ b/Changelog.txt
@@ -1,3 +1,11 @@
+## 2016-03-11 v2.1 release
+-added cpu parameter to TransDecoder.predict
+-retaining gene identifier information from cufflinks output
+-added sample data and examples for the various use-cases.
+
+
+
+
 
 ## 2015-01-26   v2.0 release
 
diff --git a/PerlLib/Fasta_retriever.pm b/PerlLib/Fasta_retriever.pm
new file mode 100755
index 0000000..2ced6ea
--- /dev/null
+++ b/PerlLib/Fasta_retriever.pm
@@ -0,0 +1,93 @@
+package Fasta_retriever;
+
+use strict;
+use warnings;
+use Carp;
+use threads;
+use threads::shared;
+
+sub new {
+    my ($packagename) = shift;
+    my $filename = shift;
+    
+    unless ($filename) {
+        confess "Error, need filename as param";
+    }
+
+    my $self = { filename => $filename,
+                 acc_to_pos_index => undef,
+                 fh => undef,
+    };
+    
+    my %acc_to_pos_index :shared;
+
+    $self->{acc_to_pos_index} = \%acc_to_pos_index;
+        
+    bless ($self, $packagename);
+
+    $self->_init();
+
+
+    return($self);
+}
+
+
+sub _init {
+    my $self = shift;
+    
+    my $filename = $self->{filename};
+        
+    open (my $fh, $filename) or die $!;
+    $self->{fh} = $fh;
+    while (<$fh>) {
+        if (/>(\S+)/) {
+            my $acc = $1;
+            my $file_pos = tell($fh);
+            $self->{acc_to_pos_index}->{$acc} = $file_pos;
+        }
+    }
+    
+    return;
+}
+
+sub refresh_fh {
+    my $self = shift;
+    
+    open (my $fh, $self->{filename}) or die "Error, cannot open file : " . $self->{filename};
+    $self->{fh} = $fh;
+    
+    return;
+}
+
+
+sub get_seq {
+    my $self = shift;
+    my $acc = shift;
+
+    unless ($acc) {
+        confess "Error, need acc as param";
+    }
+
+    my $file_pos = $self->{acc_to_pos_index}->{$acc} or confess "Error, no seek pos for acc: $acc";
+    
+    my $fh = $self->{fh};
+    seek($fh, $file_pos, 0);
+    
+    my $seq = "";
+    while (<$fh>) {
+        if (/^>/) {
+            last;
+        }
+        $seq .= $_;
+    }
+
+    $seq =~ s/\s+//g;
+
+    return($seq);
+}
+    
+    
+    
+
+1; #EOM
+    
diff --git a/PerlLib/Longest_orf.pm b/PerlLib/Longest_orf.pm
index 61737c5..cbc2299 100755
--- a/PerlLib/Longest_orf.pm
+++ b/PerlLib/Longest_orf.pm
@@ -169,13 +169,13 @@ sub capture_all_ORFs {
     if (@orfs) {
 		## set in order of decreasing length
 		@orfs = reverse sort {$a->{length} <=> $b->{length}} @orfs;
-		
 		my $longest_orf = $orfs[0];
+		my $seq = $longest_orf->{sequence};
+    my $length = length($seq);
+    my $protein = &translate_sequence($seq, 1);
+		
 		my $start = $longest_orf->{start};
 		my $stop = $longest_orf->{stop};
-		my $seq = $longest_orf->{sequence};
-		my $length = length($seq);
-		my $protein = &translate_sequence($seq, 1);
 		$self->{end5} = $start;  ## now coord is seq_based instead of array based.
 		$self->{end3} = $stop;
 		$self->{length} = $length;
@@ -187,6 +187,44 @@ sub capture_all_ORFs {
 	return (@orfs);
 }
 
+sub check_for_fp_5prime_partials {
+  my($self,$min_len,$orfs_ref,$pct_cutoff) = @_;
+  my @orfs = @$orfs_ref;
+  print "IN fp $#orfs\n" if($SEE);
+  $pct_cutoff /= 100;
+  for(my $i = 0; $i <= $#orfs; $i++) {
+    use Data::Dumper;
+
+    next if($orfs[$i]->{type} ne '5prime_partial');
+    my $fl_start = index($orfs[$i]->{protein}, 'M');
+    next if($fl_start == -1);
+
+ 
+    my $fl_protein = substr($orfs[$i]->{protein}, $fl_start);
+    print "partial with M\nnew prot: $fl_protein\n" if($SEE);
+    print Dumper($orfs[$i]) if($SEE);
+
+    next if(length($fl_protein) < $min_len);
+    next if(length($fl_protein) < $pct_cutoff*length($orfs[$i]->{protein}));
+    next if($i < $#orfs && length($orfs[$i+1]->{protein}) > length($fl_protein));
+ 
+    print "seq start: $orfs[$i]->{start} new start: ".($orfs[$i]->{start} + $fl_start*3)." length: $orfs[$i]->{length}\n" if($SEE);
+    $orfs[$i]->{sequence} = substr($orfs[$i]->{sequence},$fl_start*3);
+    $orfs[$i]->{length} = length($orfs[$i]->{sequence});
+    if($orfs[$i]->{orient} eq '+') {
+      $orfs[$i]->{start} = $orfs[$i]->{start} +$fl_start*3;
+    } else {
+      $orfs[$i]->{start} = $orfs[$i]->{start} -$fl_start*3;
+    }
+
+    $orfs[$i]->{protein} = $fl_protein;
+    $orfs[$i]->{type} = 'complete';
+    
+    print Dumper($orfs[$i]) if($SEE);
+  }
+
+}
+
 sub orfs {
     my $self = shift;
     return (@{$self->{all_ORFS}});
@@ -271,8 +309,31 @@ sub get_orfs {
 				if ($protein =~ /\*.*\*/) {
 				  confess "Fatal Error: Longest_orf: ORF returned which contains intervening stop(s): ($start-$stop, $direction\nProtein:\n$protein\nOf Nucleotide Seq:\n$seq\n";
 				}
+
+        my $got_start = 0;
+        my $got_stop = 0;
+        if ($protein =~ /^M/) {
+            $got_start = 1;
+        } 
+        if ($protein =~ /\*$/) {
+            $got_stop = 1;
+        }
+        
+        my $prot_type = "";
+        if ($got_start && $got_stop) {
+            $prot_type = "complete";
+        } elsif ($got_start) {
+            $prot_type = "3prime_partial";
+        } elsif ($got_stop) {
+            $prot_type = "5prime_partial";
+        } else {
+            $prot_type = "internal";
+        }
+        print "Setting type $prot_type\n" if($SEE);
+
 				my $orf = { sequence => $orfSeq,
 							protein => $protein,
+              type => $prot_type,
 							start=>$start,
 							stop=>$stop,
 							length=>length($orfSeq),
diff --git a/TransDecoder.LongOrfs b/TransDecoder.LongOrfs
index df68e5b..68b30a4 100755
--- a/TransDecoder.LongOrfs
+++ b/TransDecoder.LongOrfs
@@ -22,6 +22,8 @@ Optional:
 
  -S                                     strand-specific (only analyzes top strand)
 
+ -p <int>                               shorten potential 5' partials if they are this percentage of the original protein or longer.
+
 =head1 Genetic Codes
 
 See L<http://golgi.harvard.edu/biolinks/gencode.html>. These are currently supported:
@@ -72,6 +74,8 @@ my ($transcripts_file);
 
 my $genetic_code='universal';
 
+my $preserve_full_lengths = undef;
+
 my $TOP_STRAND_ONLY = 0;
 
 my $help;
@@ -89,6 +93,7 @@ my $MPI_DEBUG = 1;
              'h' => \$help,
              'v' => \$verbose,
              'S' => \$TOP_STRAND_ONLY, 
+             'p=i' => \$preserve_full_lengths
              );
 
 
@@ -170,6 +175,9 @@ main: {
 		my @orf_structs = $longest_orf_finder->capture_all_ORFs($sequence);
 		
 		@orf_structs = reverse sort {$a->{length}<=>$b->{length}} @orf_structs;
+    print "checking for fp in $acc \n" if ($SEE);
+    $longest_orf_finder->check_for_fp_5prime_partials($MIN_PROT_LENGTH,\@orf_structs,$preserve_full_lengths) 
+       if(defined($preserve_full_lengths));
 		
         while (@orf_structs) {
             my $orf = shift @orf_structs;
@@ -200,6 +208,8 @@ main: {
             if ($start < 1) {
                 $start += 3;
             }
+
+            print "Candidate (len $length): ".Dumper($orf) if($SEE);
             
                         
             if ($length < $MIN_PROT_LENGTH) { next; }
@@ -210,6 +220,10 @@ main: {
             my $gene_obj = new Gene_obj();
             
 
+            print "dumping coords for $acc $start $stop\n" if($SEE);
+            print Dumper(%$cds_coords_href) if($SEE);
+            print Dumper(%$exon_coords_href) if($SEE);
+            
             $gene_obj->populate_gene_object($cds_coords_href, $exon_coords_href);
             $gene_obj->{asmbl_id} = $acc;
             
diff --git a/TransDecoder.Predict b/TransDecoder.Predict
index 912d921..a94080a 100755
--- a/TransDecoder.Predict
+++ b/TransDecoder.Predict
@@ -22,6 +22,7 @@ Common options:
  
  --retain_blastp_hits <string>
 
+ --cpu <int>                            Use multipe cores for cd-hit-est. (default=1)
 
 
 Advanced options
@@ -30,7 +31,8 @@ Advanced options
                                         longest non-redundant ORFs used
 
  -T <int>                               If no --train, top longest ORFs to train Markov Model (hexamer stats) (default: 500)
-
+                                        Note, 10x this value are first selected for use with cd-hit to remove redundancies,
+                                        and then this -T value of longest ORFs are selected from the non-redundant set.
 
 =cut
 
@@ -74,7 +76,7 @@ my $RETAIN_LONG_ORFS = 900;
 
 my $retain_pfam_hits_file;
 my $retain_blastp_hits_file;
-
+my $cpu = 1;
 my $MPI_DEBUG = 1;
 
 &GetOptions( 't=s' => \$transcripts_file,
@@ -94,6 +96,7 @@ my $MPI_DEBUG = 1;
              
              'retain_pfam_hits=s' => \$retain_pfam_hits_file,
              'retain_blastp_hits=s' => \$retain_blastp_hits_file,
+             'cpu=i' => \$cpu,
              
              );
 
@@ -141,7 +144,7 @@ main: {
             my $red_num = $top_ORFs_train * 10;
             my $red_num_cds_longest_file = "$cds_file.top_longest_${red_num}";
             &process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num > $red_num_cds_longest_file");
-            &process_cmd("$cd_hit_est_exec -r 1 -i $red_num_cds_longest_file -c 0.80 -o $red_num_cds_longest_file.nr80 -M 0 ");
+            &process_cmd("$cd_hit_est_exec -r 1 -i $red_num_cds_longest_file -T $cpu -c 0.80 -o $red_num_cds_longest_file.nr80 -M 0 ");
             &process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr80 $top_ORFs_train > $top_cds_file");
         
             &process_cmd("touch $checkpoint");
diff --git a/sample_data/README.md b/sample_data/README.md
new file mode 100644
index 0000000..900b6cb
--- /dev/null
+++ b/sample_data/README.md
@@ -0,0 +1,12 @@
+# TransDecoder examples
+
+Here are three examples of how TransDecoder can be applied.
+
+The simplest is in 'simple_transcriptome_target/', where TransDecoder is applied to a small Trinity.fasta file containing some assembled transcript sequences.  Here there is no reference genome to compare to.
+
+Other examples where a reference genome is available include starting with Cufflinks output or PASA output.
+
+The 'cufflinks_example/' contains a small genome sequence and cufflinks transcript annotations in GTF file format.  Here, the transcript sequences are extracted, TransDecoder is executed to find likely coding regions, and the coding region annotations are generated to reflect the coding regions on the genome sequence.
+
+Similarly, in the 'pasa_example/', we start with a set of PASA-reconstructed transcript sequences, the transcript structure annotation on the genome, and a small genome sequence.  Likely coding regions are identified in the PASA assemblies, and a genome annotation file is generated to describe these coding regions in the genomic context.
+
diff --git a/sample_data/blastp.results.outfmt6.gz b/sample_data/cufflinks_example/blastp.results.outfmt6.gz
similarity index 100%
rename from sample_data/blastp.results.outfmt6.gz
rename to sample_data/cufflinks_example/blastp.results.outfmt6.gz
diff --git a/sample_data/cleanme.pl b/sample_data/cufflinks_example/cleanme.pl
similarity index 100%
copy from sample_data/cleanme.pl
copy to sample_data/cufflinks_example/cleanme.pl
diff --git a/sample_data/pfam.domtblout.gz b/sample_data/cufflinks_example/pfam.domtblout.gz
similarity index 100%
rename from sample_data/pfam.domtblout.gz
rename to sample_data/cufflinks_example/pfam.domtblout.gz
diff --git a/sample_data/runMe.sh b/sample_data/cufflinks_example/runMe.sh
similarity index 59%
rename from sample_data/runMe.sh
rename to sample_data/cufflinks_example/runMe.sh
index fe9ea42..fc27d70 100755
--- a/sample_data/runMe.sh
+++ b/sample_data/cufflinks_example/runMe.sh
@@ -21,37 +21,36 @@ if [ -e pfam.domtblout.gz ] && [ ! -e pfam.domtblout ]; then
 fi
 
 
-
 ## generate alignment gff3 formatted output
-../util/cufflinks_gtf_to_alignment_gff3.pl transcripts.gtf > transcripts.gff3
+../../util/cufflinks_gtf_to_alignment_gff3.pl transcripts.gtf > transcripts.gff3
 
 ## generate transcripts fasta file
-../util/cufflinks_gtf_genome_to_cdna_fasta.pl transcripts.gtf test.genome.fasta > transcripts.fasta 
+../../util/cufflinks_gtf_genome_to_cdna_fasta.pl transcripts.gtf test.genome.fasta > transcripts.fasta 
 
 ## Extract the long ORFs
-../TransDecoder.LongOrfs -t transcripts.fasta
+../../TransDecoder.LongOrfs -t transcripts.fasta
 
 
 ## Predict likely ORFs
 if [ $1 ]; then
     ## use pfam and blast results:
-    ../TransDecoder.Predict  -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.results.outfmt6 -v
+    ../../TransDecoder.Predict  -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.results.outfmt6 -v
 else
     # just coding metrics
-    ../TransDecoder.Predict -t transcripts.fasta 
+    ../../TransDecoder.Predict -t transcripts.fasta 
 fi
 
 ## convert to genome coordinates
-../util/cdna_alignment_orf_to_genome_orf.pl transcripts.fasta.transdecoder.gff3 transcripts.gff3 transcripts.fasta > transcripts.fasta.transdecoder.genome.gff3
+../../util/cdna_alignment_orf_to_genome_orf.pl transcripts.fasta.transdecoder.gff3 transcripts.gff3 transcripts.fasta > transcripts.fasta.transdecoder.genome.gff3
 
 
 ## make bed files for viewing with GenomeView
 
 # covert cufflinks gtf to bed
-../util/cufflinks_gtf_to_bed.pl transcripts.gtf > transcripts.bed
+../../util/cufflinks_gtf_to_bed.pl transcripts.gtf > transcripts.bed
 
 # convert the genome-based gene-gff3 file to bed
-../util/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed
+../../util/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed
 
 echo
 echo
diff --git a/sample_data/test.genome.fasta.gz b/sample_data/cufflinks_example/test.genome.fasta.gz
similarity index 100%
rename from sample_data/test.genome.fasta.gz
rename to sample_data/cufflinks_example/test.genome.fasta.gz
diff --git a/sample_data/test.tophat.sam.gz b/sample_data/cufflinks_example/test.tophat.sam.gz
similarity index 100%
rename from sample_data/test.tophat.sam.gz
rename to sample_data/cufflinks_example/test.tophat.sam.gz
diff --git a/sample_data/transcripts.gtf.gz b/sample_data/cufflinks_example/transcripts.gtf.gz
similarity index 100%
rename from sample_data/transcripts.gtf.gz
rename to sample_data/cufflinks_example/transcripts.gtf.gz
diff --git a/sample_data/cleanme.pl b/sample_data/pasa_example/cleanme.pl
similarity index 58%
copy from sample_data/cleanme.pl
copy to sample_data/pasa_example/cleanme.pl
index f5f72ad..88f5233 100755
--- a/sample_data/cleanme.pl
+++ b/sample_data/pasa_example/cleanme.pl
@@ -11,16 +11,12 @@ chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
 
 
 
-my @files_to_keep = qw (cleanme.pl 
-                        runMe.sh
-                        test.genome.fasta.gz
-                        test.tophat.sam.gz
-                        transcripts.gtf.gz
-                 
-                 
-
-pfam.domtblout.gz
-blastp.results.outfmt6.gz
+my @files_to_keep = qw (
+cleanme.pl
+genome.fasta.gz
+pasa_assemblies.fasta.gz
+pasa_assemblies.gff3.gz
+runMe.sh
                         );
 
 
@@ -36,7 +32,7 @@ foreach my $file (<*>) {
 }
 
 
-`rm -rf ./transcripts.fasta.transdecoder_dir/`;
+`rm -rf ./pasa_assemblies.fasta.transdecoder_dir/`;
 
 
 exit(0);
diff --git a/sample_data/pasa_example/genome.fasta.gz b/sample_data/pasa_example/genome.fasta.gz
new file mode 100644
index 0000000..0954af6
Binary files /dev/null and b/sample_data/pasa_example/genome.fasta.gz differ
diff --git a/sample_data/pasa_example/pasa_assemblies.fasta.gz b/sample_data/pasa_example/pasa_assemblies.fasta.gz
new file mode 100644
index 0000000..3eca927
Binary files /dev/null and b/sample_data/pasa_example/pasa_assemblies.fasta.gz differ
diff --git a/sample_data/pasa_example/pasa_assemblies.gff3.gz b/sample_data/pasa_example/pasa_assemblies.gff3.gz
new file mode 100644
index 0000000..bdb2aac
Binary files /dev/null and b/sample_data/pasa_example/pasa_assemblies.gff3.gz differ
diff --git a/sample_data/pasa_example/runMe.sh b/sample_data/pasa_example/runMe.sh
new file mode 100755
index 0000000..6470d43
--- /dev/null
+++ b/sample_data/pasa_example/runMe.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+if [ ! -e genome.fasta ]; then
+    gunzip -c genome.fasta.gz > genome.fasta
+fi
+
+if [ ! -e pasa_assemblies.fasta ]; then
+    gunzip -c pasa_assemblies.fasta.gz > pasa_assemblies.fasta
+fi
+
+if [ ! -e pasa_assemblies.gff3 ]; then
+    gunzip -c pasa_assemblies.gff3.gz > pasa_assemblies.gff3
+fi
+
+../../TransDecoder.LongOrfs -t pasa_assemblies.fasta
+
+../../TransDecoder.Predict -t pasa_assemblies.fasta
+
+../../util/cdna_alignment_orf_to_genome_orf.pl  pasa_assemblies.fasta.transdecoder.gff3 pasa_assemblies.gff3 pasa_assemblies.fasta | tee pasa_assemblies.fasta.transdecoder.genome.gff3
+
+exit 0
diff --git a/sample_data/simple_transcriptome_target/Trinity.fasta.gz b/sample_data/simple_transcriptome_target/Trinity.fasta.gz
new file mode 100644
index 0000000..509769c
Binary files /dev/null and b/sample_data/simple_transcriptome_target/Trinity.fasta.gz differ
diff --git a/sample_data/cleanme.pl b/sample_data/simple_transcriptome_target/cleanme.pl
similarity index 58%
rename from sample_data/cleanme.pl
rename to sample_data/simple_transcriptome_target/cleanme.pl
index f5f72ad..bbe0088 100755
--- a/sample_data/cleanme.pl
+++ b/sample_data/simple_transcriptome_target/cleanme.pl
@@ -11,16 +11,10 @@ chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
 
 
 
-my @files_to_keep = qw (cleanme.pl 
-                        runMe.sh
-                        test.genome.fasta.gz
-                        test.tophat.sam.gz
-                        transcripts.gtf.gz
-                 
-                 
-
-pfam.domtblout.gz
-blastp.results.outfmt6.gz
+my @files_to_keep = qw (
+cleanme.pl
+runMe.sh
+Trinity.fasta.gz
                         );
 
 
@@ -36,7 +30,7 @@ foreach my $file (<*>) {
 }
 
 
-`rm -rf ./transcripts.fasta.transdecoder_dir/`;
+`rm -rf ./Trinity.fasta.transdecoder_dir/`;
 
 
 exit(0);
diff --git a/sample_data/simple_transcriptome_target/runMe.sh b/sample_data/simple_transcriptome_target/runMe.sh
new file mode 100755
index 0000000..c42569a
--- /dev/null
+++ b/sample_data/simple_transcriptome_target/runMe.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if [ ! -e Trinity.fasta ]; then
+    gunzip -c Trinity.fasta.gz > Trinity.fasta
+fi
+
+../../TransDecoder.LongOrfs -t Trinity.fasta
+
+../../TransDecoder.Predict -t Trinity.fasta
+
+exit 0
diff --git a/util/cdna_alignment_orf_to_genome_orf.pl b/util/cdna_alignment_orf_to_genome_orf.pl
index 2284f66..4a3b1ee 100755
--- a/util/cdna_alignment_orf_to_genome_orf.pl
+++ b/util/cdna_alignment_orf_to_genome_orf.pl
@@ -2,6 +2,7 @@
 
 use strict;
 use warnings;
+use Carp;
 
 use FindBin;
 use lib ("$FindBin::Bin/../PerlLib");
@@ -16,6 +17,13 @@ my $cdna_orfs_gff3 = $ARGV[0] or die $usage;
 my $cdna_genome_gff3 = $ARGV[1] or die $usage;
 my $cdna_fasta = $ARGV[2] or die $usage;
 
+# ensure can find the inputs
+foreach my $file ($cdna_orfs_gff3, $cdna_genome_gff3, $cdna_fasta) {
+    unless (-s $file) {
+        die "Error, cannot locate file: $file";
+    }
+}
+
 
 my $WARNING_COUNT = 0; # count those orfs that appear to be on strand opposite from the transcribed strand.
 
@@ -37,7 +45,7 @@ main: {
     
         my @gene_ids = @{$contig_to_gene_list_href->{$asmbl_id}};
     
-        foreach my $gene_id (@gene_ids) {
+        foreach my $gene_id (@gene_ids) { # gene identifiers as given by transdecoder on the transcript sequences
             my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
             
             my $asmbl_id = $gene_obj_ref->{asmbl_id};
@@ -65,11 +73,14 @@ main: {
                     #$new_orf_gene->{TU_feat_name} = "t.$asmbl_id.$orf_count";
                     #$new_orf_gene->{Model_feat_name} = "m.$asmbl_id.$orf_count";
                     $new_orf_gene->{com_name} = "ORF";
+
+                    my $use_gene_id = $transcript_struct->{gene_id} || $gene_id;
                     
-                    $new_orf_gene->{TU_feat_name} = $gene_id;
+                    
+                    $new_orf_gene->{TU_feat_name} = $use_gene_id;
                     $new_orf_gene->{Model_feat_name} = $gene_obj_ref->{Model_feat_name};
-                                        
-
+                    
+                    
                     print $new_orf_gene->to_GFF3_format(source => "transdecoder") . "\n";
                     
                 }
@@ -111,6 +122,18 @@ sub parse_transcript_alignment_info {
 
         $info =~ /Target=(\S+)/ or die "Error, cannot parse ID from $info";
         my $asmbl = $1;
+
+        my $trans_id = "";
+        my $gene_id = "";
+           
+        
+        if ($asmbl =~ /^GENE\^(\S+),TRANS\^(\S+)/) {
+            $gene_id = $1;
+            $trans_id = $2;
+
+            $asmbl = $2;
+        }
+        
         
         if (my $struct = $cdna_alignments{$asmbl}) {
             push (@{$struct->{coords}}, [$lend, $rend]);
@@ -121,10 +144,14 @@ sub parse_transcript_alignment_info {
                            contig => $contig,
                            
                            coords => [ 
-                                       [$lend, $rend]
-                                     ],
+                               [$lend, $rend]
+                               ],
+                               
                            orient => $orient,
-                                   };
+                               trans_id => $trans_id,
+                               gene_id => $gene_id,
+                               
+            };
 
             $cdna_alignments{$asmbl} = $struct;
         }
@@ -141,7 +168,7 @@ sub parse_transcript_alignment_info {
 sub place_orf_in_cdna_alignment_context {
     my ($transcript_struct, $orf_gene_obj, $cdna_seq_lengths_href) = @_;
 
-    my $trans_seq_length = $cdna_seq_lengths_href->{ $transcript_struct->{asmbl} } or die "Error, no length for " . Dumper($transcript_struct);
+    my $trans_seq_length = $cdna_seq_lengths_href->{ $transcript_struct->{asmbl} } or confess "Error, no length for " . Dumper($transcript_struct) . " Please be sure to use a cDNA fasta file and not a genome fasta file for your commandline parameter.";
     
 
 
diff --git a/util/create_start_PSSM.pl b/util/create_start_PSSM.pl
new file mode 100755
index 0000000..811493c
--- /dev/null
+++ b/util/create_start_PSSM.pl
@@ -0,0 +1,131 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Gene_obj_indexer;
+use GFF3_utils;
+use Carp;
+use Fasta_retriever;
+use Nuc_translator;
+
+$|++;
+
+my $usage = "\n\nusage: $0 gene_db.inx_file transcripts.fasta train_accs.list pre_length post_length\n\n";
+
+my $inx_file = $ARGV[0] or die $usage;
+my $transcripts_fasta = $ARGV[1] or die $usage;
+my $train_accs_file = $ARGV[2] or die $usage;
+my $pre_length = $ARGV[3] or die $usage;
+my $post_length = $ARGV[4] or die $usage;
+
+
+main: {
+
+    my $gene_obj_indexer = new Gene_obj_indexer( { "use" => "$inx_file" } );
+
+    my $fasta_retriever = new Fasta_retriever($transcripts_fasta);
+
+    my @train_accs = `cat $train_accs_file`;
+    chomp @train_accs;
+    
+    my @nuc_frequency;
+    
+    my $seq_counter = 0;
+
+    foreach my $gene_id (@train_accs) {
+
+        my $gene_obj = $gene_obj_indexer->get_gene($gene_id);
+        
+        my $com_name = $gene_obj->{com_name};
+
+        unless ($com_name =~ /type:complete/ || $com_name =~ /type:3prime_partial/) { next; } # need start codon
+
+
+        my $orient = $gene_obj->get_orientation();
+        my ($lend, $rend) = sort {$a<=>$b} $gene_obj->get_model_span();
+        my $contig_id = $gene_obj->{asmbl_id};
+        
+        my $trans_seq = uc $fasta_retriever->get_seq($contig_id);
+
+        my $seq_length = length($trans_seq);
+        
+        if ($orient eq '-') {
+            $trans_seq = &reverse_complement($trans_seq);
+            $lend = $seq_length - $rend + 1;
+        }
+
+        my $atg = substr($trans_seq, $lend-1, 3);
+        unless ($atg eq "ATG") {
+            die "Error, missing ATG start: $atg ";
+        }
+        
+        my $begin_profile = $lend - 1 - $pre_length;
+        my $profile_length = $pre_length + 3 + $post_length;
+
+        if ($begin_profile >= 1) {
+            my $start_codon_context_substring = substr($trans_seq, $begin_profile, $profile_length);
+            #print "$start_codon_context_substring\n";
+            &add_to_profile(\@nuc_frequency, $start_codon_context_substring);
+            $seq_counter++;
+        }
+        
+        
+        
+    }
+ 
+
+    &write_PSSM($pre_length, $seq_counter, \@nuc_frequency);
+    
+   
+    exit(0);
+}
+
+
+####
+sub add_to_profile {
+    my ($nuc_freq_aref, $context_string) = @_;
+
+    my @chars = split(//, $context_string);
+
+    for (my $i = 0; $i <= $#chars; $i++) {
+        my $char = $chars[$i];
+        $nuc_freq_aref->[$i]->{$char}++;
+    }
+    
+    return;
+}
+
+
+####
+sub write_PSSM {
+    my ($pre_length, $num_seqs, $nuc_freq_aref) = @_;
+    
+    my @nucs = qw(G A T C);
+    
+    my $atg_pos = $pre_length + 1;
+    print "#ATG=$atg_pos\n";
+    print "#" . join("\t", @nucs) . "\n";
+    
+    for (my $i = 0; $i < $#$nuc_freq_aref; $i++) {
+
+        my %char_counts = %{$nuc_freq_aref->[$i]};
+        
+        my @rel_freqs;
+        
+        foreach my $nuc (@nucs) {
+            my $count = $char_counts{$nuc} || 0;
+
+            my $relative_freq = sprintf("%.3f", $count/$num_seqs);
+
+            push (@rel_freqs, $relative_freq);
+        }
+
+        print join("\t", @rel_freqs) . "\n";
+    }
+
+    return;
+}
diff --git a/util/cufflinks_gtf_to_alignment_gff3.pl b/util/cufflinks_gtf_to_alignment_gff3.pl
index 7d15125..f76d628 100755
--- a/util/cufflinks_gtf_to_alignment_gff3.pl
+++ b/util/cufflinks_gtf_to_alignment_gff3.pl
@@ -77,20 +77,22 @@ main: {
 				my $gene_obj = new Gene_obj();
 
 				$gene_obj->{TU_feat_name} = $gene_id;
-				$gene_obj->{Model_feat_name} = $trans_id;
+				$gene_obj->{Model_feat_name} = "$trans_id";
 				$gene_obj->{com_name} = "cufflinks $gene_id $trans_id";
 				
 				$gene_obj->{asmbl_id} = $scaff;
 				
 				$gene_obj->populate_gene_object($coords_href, $coords_href);
-                
-				print $gene_obj->to_alignment_GFF3_format($trans_id, "$trans_id", "Cufflinks");
+
+
+                ## encode gene and trans id in the ID and target fields for later extraction.  (probably a better way to do this!!)
+				print $gene_obj->to_alignment_GFF3_format("GENE^$gene_id,TRANS^$trans_id", "GENE^$gene_id,TRANS^$trans_id", "Cufflinks");
 				
 				print "\n";
 			}
 		}
 	}
-
+    
 
 	exit(0);
 }
diff --git a/util/cufflinks_gtf_to_bed.pl b/util/cufflinks_gtf_to_bed.pl
index 28547ba..8bbb4ed 100755
--- a/util/cufflinks_gtf_to_bed.pl
+++ b/util/cufflinks_gtf_to_bed.pl
@@ -68,11 +68,11 @@ main: {
 
 		my $genes_href = $genome_trans_to_coords{$scaff};
 
-		foreach my $gene_id (keys %$genes_href) {
+		foreach my $gene_id (sort keys %$genes_href) {
 
 			my $trans_href = $genes_href->{$gene_id};
 
-			foreach my $trans_id (keys %$trans_href) {
+			foreach my $trans_id (sort keys %$trans_href) {
 
 				my $coords_href = $trans_href->{$trans_id};
 
diff --git a/util/extract_FL_subset.pl b/util/extract_FL_subset.pl
new file mode 100755
index 0000000..93b7c76
--- /dev/null
+++ b/util/extract_FL_subset.pl
@@ -0,0 +1,134 @@
+#!/usr/bin/env perl
+
+=pod
+
+=head1 NAME
+
+extract_FL_subset.pl
+
+=head1 USAGE
+
+Required:
+
+ --transcripts, -t <string>                            transcripts.fasta
+
+ --gff3, -g <string>                            transcripts.fasta.transdecoder.gff3
+
+=cut
+
+
+use strict;
+use warnings;
+use FindBin;
+use Pod::Usage;
+use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
+use Data::Dumper;
+use File::Basename;
+
+use lib ("$FindBin::RealBin/../PerlLib");
+use Gene_obj;
+
+
+my $UTIL_DIR = "$FindBin::RealBin";
+
+my $help;
+
+my $transcripts_file;
+my $gff3_file;
+
+&GetOptions( 'transcripts|t=s' => \$transcripts_file,
+             'gff3|g=s' => \$gff3_file,
+
+             'h' => \$help,
+    );
+
+
+
+pod2usage(-verbose => 2, -output => \*STDERR) if ($help || ! ($transcripts_file && $gff3_file));
+
+if (@ARGV) {
+    die "Error, don't understand options: @ARGV";
+}
+
+
+main: {
+    
+    my $output_prefix = basename($transcripts_file);
+
+    # get the FL entries:
+    my $FL_accs_file = "$output_prefix.complete_only";
+    my $cmd = "$UTIL_DIR/get_FL_accs.pl $gff3_file > $FL_accs_file";
+    &process_cmd($cmd);
+
+    # index the current gff file:
+    $cmd = "$UTIL_DIR/index_gff3_files_by_isoform.pl $gff3_file";
+    &process_cmd($cmd);
+        
+    # retrieve the best entries:
+    my $complete_gff3_filename = $FL_accs_file . ".gff3";
+    $cmd = "$UTIL_DIR/gene_list_to_gff.pl $FL_accs_file $gff3_file.inx > $FL_accs_file.gff3";
+    &process_cmd($cmd);
+    
+    
+    ##############################
+    ## Generate the final outputs.
+    ##############################
+    
+    {
+                        
+        ## write final outputs:
+        $gff3_file = "$FL_accs_file.gff3";
+        
+
+        ## make a BED file for viewing in IGV
+        my $bed_file = $gff3_file;
+        $bed_file =~ s/\.gff3$/\.bed/;
+        $cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
+        &process_cmd($cmd);
+        
+    
+        # make a peptide file:
+        my $best_pep_file = $gff3_file;
+        $best_pep_file =~ s/\.gff3$/\.pep/;
+        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file > $best_pep_file";
+        &process_cmd($cmd);
+        
+        
+
+        # make a CDS file:
+        my $best_cds_file = $best_pep_file;
+        $best_cds_file =~ s/\.pep$/\.cds/;
+        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file CDS > $best_cds_file";
+        &process_cmd($cmd);
+        
+        # make a CDS file:
+        my $best_cdna_file = $best_pep_file;
+        $best_cdna_file =~ s/\.pep$/\.mRNA/;
+        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file cDNA > $best_cdna_file";
+        &process_cmd($cmd);
+    
+    }
+    
+    print STDERR "Done. See output files $FL_accs_file.\*\n\n\n";
+    
+    
+    
+    exit(0);
+}
+
+
+####
+sub process_cmd {
+	my ($cmd) = @_;
+
+	print "CMD: $cmd\n";
+	my $ret = system($cmd);
+
+	if ($ret) {
+		die "Error, cmd: $cmd died with ret $ret";
+	}
+	
+	return;
+
+}
+
diff --git a/util/get_FL_accs.pl b/util/get_FL_accs.pl
new file mode 100755
index 0000000..6620846
--- /dev/null
+++ b/util/get_FL_accs.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $usage = "\n\n\tusage: $0 transcripts.fasta.transdecoder.gff3\n\n\n";
+
+my $gff3_file = $ARGV[0] or die $usage;
+
+
+main: {
+
+    open (my $fh, $gff3_file) or die $usage;
+    while (<$fh>) {
+        unless (/\w/) { next; }
+        my @x = split(/\t/);
+        my $type = $x[2];
+        if ($type eq 'mRNA') {
+            my $info = $x[8];
+            if ($info =~ /complete/) {
+                $info =~ /ID=([^;]+);/ or die "Error, cannot parse ID from $info";
+                my $isoform_id = $1;
+                print "$isoform_id\n";
+            }
+        }
+    }
+    close $fh;
+
+    exit(0);
+}
+
diff --git a/util/get_longest_ORF_per_transcript.pl b/util/get_longest_ORF_per_transcript.pl
new file mode 100755
index 0000000..2d6498f
--- /dev/null
+++ b/util/get_longest_ORF_per_transcript.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Fasta_reader;
+
+my $usage = "\n\n\tusage: $0 file.transdecoder.pep\n\n";
+
+my $pep_file = $ARGV[0] or die $usage;
+
+ main: {
+
+     my %contig_to_longest_prot;
+     
+     my $fasta_reader = new Fasta_reader($pep_file);
+     while (my $seq_obj = $fasta_reader->next()) {
+         my $header = $seq_obj->get_header();
+
+         #  len:365 (+) asmbl_104:2-1096(+)
+         if ($header =~ /len:(\d+) \([+-]\) (\S+):\d+-\d+\([+-]\)/) {
+             my $len = $1;
+             my $contig = $2;
+
+
+             &store_entry(\%contig_to_longest_prot, $seq_obj, $len, $contig);
+         }
+         else {
+             die "Error, cannot parse header info: $header ";
+         }
+     }
+
+     foreach my $struct (values %contig_to_longest_prot) {
+         my $fa = $struct->{seq_obj}->get_FASTA_format();
+
+         print $fa;
+     }
+
+     exit(0);
+}
+
+####
+sub store_entry {
+    my ($contig_to_longest_prot_href, $seq_obj, $len, $contig) = @_;
+
+    my $struct = $contig_to_longest_prot_href->{$contig};
+    
+    if ( (! $struct) || $struct->{length} < $len) {
+        
+        $struct = { seq_obj => $seq_obj,
+                    length => $len,
+        };
+
+        $contig_to_longest_prot_href->{$contig} = $struct;
+    }
+
+    return;
+}
+
+
+    
-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/transdecoder.git
    
    
More information about the debian-med-commit
mailing list