[med-svn] [transdecoder] 01/01: Imported Upstream version 3.0.0+dfsg

Michael Crusoe misterc-guest at moszumanska.debian.org
Fri May 20 07:50:00 UTC 2016


This is an automated email from the git hooks/post-receive script.

misterc-guest pushed a commit to annotated tag upstream/3.0.0+dfsg
in repository transdecoder.

commit f9ac657db54bd4d1eb376b80ea796e2e4300cfc3
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date:   Wed May 18 02:30:59 2016 -0700

    Imported Upstream version 3.0.0+dfsg
---
 .gitmodules                                        |   3 +
 PerlLib/Gene_obj.pm                                |  20 +++
 TransDecoder.LongOrfs                              |  81 +++++++--
 TransDecoder.Predict                               |  99 ++++++++---
 notes                                              |   3 +
 sample_data/Makefile                               |  14 ++
 sample_data/cufflinks_example/Makefile             |   6 +
 sample_data/cufflinks_example/blastp.outfmt6.gz    | Bin 0 -> 1957 bytes
 .../cufflinks_example/blastp.results.outfmt6.gz    | Bin 1808 -> 0 bytes
 sample_data/cufflinks_example/cleanme.pl           |   7 +-
 sample_data/cufflinks_example/pfam.domtblout.gz    | Bin 12838 -> 12460 bytes
 sample_data/cufflinks_example/runMe.sh             |  41 +++--
 sample_data/pasa_example/Makefile                  |   6 +
 sample_data/pasa_example/cleanme.pl                |   2 +
 sample_data/pasa_example/pasa_assemblies.fasta.gz  | Bin 304896 -> 304706 bytes
 sample_data/pasa_example/pasa_assemblies.gff3.gz   | Bin 55455 -> 55467 bytes
 .../pasa_example/pasa_assemblies_described.txt.gz  | Bin 0 -> 231156 bytes
 sample_data/pasa_example/runMe.sh                  |  17 +-
 sample_data/simple_transcriptome_target/Makefile   |   6 +
 sample_data/simple_transcriptome_target/cleanme.pl |   1 +
 sample_data/simple_transcriptome_target/runMe.sh   |   4 +-
 util/cdna_alignment_orf_to_genome_orf.pl           |  62 ++++---
 util/remove_eclipsed_ORFs.pl                       |   4 +-
 util/single_best_ORF_per_transcript.pl             | 187 +++++++++++++++++++++
 24 files changed, 472 insertions(+), 91 deletions(-)

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..70b3ae8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "TransDecoder.lrgTests"]
+	path = TransDecoder.lrgTests
+	url = https://github.com/TransDecoder/TransDecoder.lrgTests.git
diff --git a/PerlLib/Gene_obj.pm b/PerlLib/Gene_obj.pm
index 990011c..86ea2e4 100755
--- a/PerlLib/Gene_obj.pm
+++ b/PerlLib/Gene_obj.pm
@@ -4450,6 +4450,26 @@ sub has_CDS {
 }
 
 
+####
+sub set_CDS_phases_from_init_phase {
+    my ($self, $init_phase) = @_;
+
+    my @exons = $self->get_exons();
+
+    my $curr_cds_len = $init_phase;
+    
+    foreach my $exon (@exons) {
+        if (my $cds = $exon->get_CDS_obj()) {
+            $cds->set_phase($curr_cds_len % 3);
+            my $cds_len = $cds->length();
+            $curr_cds_len += $cds_len;
+        }
+    }
+
+    return;
+}
+
+
 
 sub set_CDS_phases {
     my ($self, $genomic_seq_ref) = @_;
diff --git a/TransDecoder.LongOrfs b/TransDecoder.LongOrfs
index 68b30a4..8935687 100755
--- a/TransDecoder.LongOrfs
+++ b/TransDecoder.LongOrfs
@@ -16,6 +16,8 @@ Required:
 
 Optional:
 
+ --gene_trans_map <string>              gene-to-transcript identifier mapping file (tab-delimited, gene_id<tab>trans_id<return> ) 
+
  -m <int>                               minimum protein length (default: 100)
  
  -G <string>                            genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia)
@@ -83,7 +85,7 @@ my $workdir;
 my $verbose;
 my $search_pfam = "";
 my ($reuse,$pfam_out);
-
+my $gene_trans_map_file;
 
 my $MPI_DEBUG = 1;
 
@@ -93,8 +95,9 @@ my $MPI_DEBUG = 1;
              'h' => \$help,
              'v' => \$verbose,
              'S' => \$TOP_STRAND_ONLY, 
-             'p=i' => \$preserve_full_lengths
-             );
+             'p=i' => \$preserve_full_lengths,
+             'gene_trans_map=s' => \$gene_trans_map_file,
+    );
 
 
 
@@ -116,6 +119,20 @@ if ($genetic_code ne 'universal') {
 
 
 main: {
+
+    my %gene_trans_map;
+    if ($gene_trans_map_file) {
+        open(my $fh, $gene_trans_map_file) or die "Error, cannot open file $gene_trans_map_file";
+        while (<$fh>) {
+            chomp;
+            my ($gene_id, $trans_id) = split(/\t/);
+            $gene_trans_map{$trans_id} = $gene_id;
+        }
+        close $fh;
+    }
+        
+           
+
     my $workdir = basename($transcripts_file) . ".transdecoder_dir"; 
     unless (-d $workdir) {
         mkdir($workdir) or die "Error, cannot mkdir $workdir.";
@@ -175,9 +192,11 @@ main: {
 		my @orf_structs = $longest_orf_finder->capture_all_ORFs($sequence);
 		
 		@orf_structs = reverse sort {$a->{length}<=>$b->{length}} @orf_structs;
-    print "checking for fp in $acc \n" if ($SEE);
-    $longest_orf_finder->check_for_fp_5prime_partials($MIN_PROT_LENGTH,\@orf_structs,$preserve_full_lengths) 
-       if(defined($preserve_full_lengths));
+        print "checking for fp in $acc \n" if ($SEE);
+        
+        if (defined($preserve_full_lengths)) {
+            $longest_orf_finder->check_for_fp_5prime_partials($MIN_PROT_LENGTH,\@orf_structs,$preserve_full_lengths) 
+        }
 		
         while (@orf_structs) {
             my $orf = shift @orf_structs;
@@ -229,19 +248,32 @@ main: {
             
             $model_counter++;
 
-            my $model_id = "$acc|m.$model_counter";
-            my $gene_id = "$acc|g.$model_counter";
+
+            my $gene_id;
+            if (%gene_trans_map) {
+                $gene_id = $gene_trans_map{$acc} or die "Error, cannot locate gene identifier for transcript acc: [$acc]";
+            }
+            elsif (my $parsed_gene_id = &try_parse_gene_id_from_acc($acc)) {
+                $gene_id = $parsed_gene_id;
+            }
+            else {
+                $gene_id = "Gene.$model_counter";
+            }
             
+            ## gene ID and model ID must be unique for each entry, but also decipherable for later on.
+            $gene_id = "${gene_id}::${acc}::g.$model_counter";
+            my $model_id = "${gene_id}::m.$model_counter";
+                        
             $gene_obj->{TU_feat_name} = $gene_id;
             $gene_obj->{Model_feat_name} = $model_id;
-
-            
+                        
             my $cds = $gene_obj->create_CDS_sequence(\$sequence);
-
+            $gene_obj->set_CDS_phases(\$sequence);
+            
             unless ($cds) {
                 die "Error, no CDS for gene: " . Dumper($cds_coords_href) . Dumper($exon_coords_href);
             }
-
+            
             my $got_start = 0;
             my $got_stop = 0;
             if ($protein =~ /^M/) {
@@ -262,22 +294,24 @@ main: {
                 $prot_type = "internal";
             }
             
-            $gene_obj->{com_name} = "ORF $gene_id $model_id type:$prot_type len:$length ($orient)";            
+            $gene_obj->{com_name} = "ORF type:$prot_type len:$length ($orient)";            
             
             # this header is identical between CDS and PEP (since PEP is just a direct translation of CDS for a specific translation table)
             # we are currently not printing this out at the final data but it would be nice to.
-            my $pep_header = ">$model_id $gene_id type:$prot_type len:$length gc:$genetic_code $acc:$start-$stop($orient)\n";
-            my $cds_header = ">$model_id $gene_id type:$prot_type len:$length $acc:$start-$stop($orient)\n";
+            my $pep_header = ">$model_id type:$prot_type len:$length gc:$genetic_code $acc:$start-$stop($orient)\n";
+            my $cds_header = ">$model_id type:$prot_type len:$length $acc:$start-$stop($orient)\n";
             
             print PEP $pep_header."$protein\n";
                         
             print CDS $cds_header."$cds\n";
+
+            
             
             print GFF $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
             
         }
 	}
-
+    
     close PEP;
     close CDS;
     close GFF;
@@ -313,3 +347,18 @@ sub process_cmd {
 
 }
 
+####
+sub try_parse_gene_id_from_acc {
+    my ($acc) = @_;
+
+    my $gene_id;
+    
+    if ($acc =~ /^(\S+_c\d+_g\d+)_i\d+$/) {
+        $gene_id = $1;
+    }
+    elsif ($acc =~ /^(\S+_c\d+)_seq\d+$/) {
+        $gene_id = $1;
+    }
+
+    return($gene_id);
+}
diff --git a/TransDecoder.Predict b/TransDecoder.Predict
index a94080a..838aea4 100755
--- a/TransDecoder.Predict
+++ b/TransDecoder.Predict
@@ -17,10 +17,14 @@ Common options:
  --retain_long_orfs <int>               retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence 
                                          marks it as coding (default: 900 bp => 300aa)
 
- --retain_pfam_hits <string>                 /path/to/pfam_db.hmm to search 
-                                        using hmmscan (which should be accessible via your PATH setting)
+ --retain_pfam_hits <string>            domain table output file from running hmmscan to search Pfam (see transdecoder.github.io for info)     
+                                        Any ORF with a pfam domain hit will be retained in the final output.
  
- --retain_blastp_hits <string>
+ --retain_blastp_hits <string>          blastp output in '-outfmt 6' format.
+                                        Any ORF with a blast match will be retained in the final output.
+
+ --single_best_orf                      Retain only the single best ORF per transcript.
+                                        (Best is defined as having (optionally pfam and/or blast support) and longest orf)
 
  --cpu <int>                            Use multipe cores for cd-hit-est. (default=1)
 
@@ -78,6 +82,7 @@ my $retain_pfam_hits_file;
 my $retain_blastp_hits_file;
 my $cpu = 1;
 my $MPI_DEBUG = 1;
+my $single_best_orf_flag = 0;
 
 &GetOptions( 't=s' => \$transcripts_file,
              'train:s' => \$train_file,
@@ -97,6 +102,8 @@ my $MPI_DEBUG = 1;
              'retain_pfam_hits=s' => \$retain_pfam_hits_file,
              'retain_blastp_hits=s' => \$retain_blastp_hits_file,
              'cpu=i' => \$cpu,
+
+             'single_best_orf' => \$single_best_orf_flag,
              
              );
 
@@ -192,35 +199,59 @@ main: {
     # get accs for best entries
     my $acc_file = "$cds_file.scores.selected";
     {
+        
+        my %att_counter;
+        
         open (my $ofh, ">$acc_file") or die "Error, cannot write to $acc_file";
         open (my $ifh, "$cds_file.scores") or die "Error, cannot open file $cds_file.scores";
         while (<$ifh>) {
             chomp;
             my ($acc, $orf_length, @scores) = split(/\t/);
+
+            my @ATTS;
             
             my $score_1 = shift @scores;
             my $max_score_other_frame = max(@scores);
-            if ($has_pfam_hit{$acc} 
-                || 
-                $has_blastp_hit{$acc}
-                ||
-                $orf_length >= $RETAIN_LONG_ORFS
-                ||
-                ($score_1 > 0 && $score_1 > $max_score_other_frame)
-                ) { 
+
+            my $keep_flag = 0;
+
+            if ($has_pfam_hit{$acc}) {
+                $keep_flag = 1;
+                push (@ATTS, "PFAM");
+                print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
+            }
+            if ($has_blastp_hit{$acc}) {
+                $keep_flag = 1;
+                push (@ATTS, "BLASTP");
+                print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
+            }
+            if ($orf_length >= $RETAIN_LONG_ORFS) {
+                $keep_flag = 1;
+                push (@ATTS, "LONGORF");
+            }
+            if ($score_1 > 0 && $score_1 > $max_score_other_frame) {
+                $keep_flag = 1;
+                push (@ATTS, "FRAMESCORE");
+            }
+            
+            if ($keep_flag) {
                 print $ofh "$acc\n";
-                
-                if ($has_pfam_hit{$acc}) {
-                    print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
-                }
-                if ($has_blastp_hit{$acc}) {
-                    print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
-                }
-                
+
+                my $att_string = join("|", sort @ATTS);
+                $att_counter{$att_string}++;
             }
         }
         close $ifh;
         close $ofh;
+    
+
+        # report on the categories of the selected ORFs
+        print STDERR "\n#####################\nCounts of kept entries according to attributes:\n";
+        foreach my $att (reverse sort {$att_counter{$a}<=>$att_counter{$b}} keys %att_counter) {
+            my $count = $att_counter{$att};
+            print STDERR join("\t", $att, $count) . "\n";
+        }
+        print STDERR "########################\n\n\n";
     }
     
     # index the current gff file:
@@ -241,8 +272,30 @@ main: {
     {
                         
         # exclude shadow orfs (smaller orfs in different reading frame that are eclipsed by longer orfs)
-        $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $final_output_prefix.gff3";
+        $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $cds_file.eclipsed_removed.gff3";
+
+        my $target_final_file = "$cds_file.eclipsed_removed.gff3";
+        
         &process_cmd($cmd);
+
+        
+        if ($single_best_orf_flag) {
+            $cmd = "$UTIL_DIR/single_best_ORF_per_transcript.pl ";
+            if ($retain_blastp_hits_file) {
+                $cmd .= " --blast_hits $retain_blastp_hits_file ";
+            }
+            if ($retain_pfam_hits_file) {
+                $cmd .= " --pfam_hits $retain_pfam_hits_file ";
+            }
+            $cmd .= " --gff3_file $cds_file.eclipsed_removed.gff3 > $cds_file.single_best_orf.gff3";
+            
+            &process_cmd($cmd);
+
+            $target_final_file = "$cds_file.single_best_orf.gff3";
+        }
+
+        
+        &process_cmd("cp $target_final_file $final_output_prefix.gff3");
                 
         ## write final outputs:
         
@@ -268,12 +321,6 @@ main: {
         $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file CDS > $best_cds_file";
         &process_cmd($cmd);
         
-        # make a CDS file:
-        my $best_cdna_file = $best_pep_file;
-        $best_cdna_file =~ s/\.pep$/\.mRNA/;
-        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file cDNA > $best_cdna_file";
-        &process_cmd($cmd);
-    
     }
     
     print STDERR "transdecoder is finished.  See output files $final_output_prefix.\*\n\n\n";
diff --git a/notes b/notes
index 7806c64..8cd2de1 100644
--- a/notes
+++ b/notes
@@ -1 +1,4 @@
 git clone https://github.com/TransDecoder/TransDecoder.git
+
+GeneID paper:
+http://genome.cshlp.org/content/10/4/511.long
diff --git a/sample_data/Makefile b/sample_data/Makefile
new file mode 100644
index 0000000..451b9a5
--- /dev/null
+++ b/sample_data/Makefile
@@ -0,0 +1,14 @@
+
+DIRS=cufflinks_example pasa_example simple_transcriptome_target
+
+test:
+	@for i in $(DIRS); do \
+	echo "Running example in $$$i..."; \
+	(cd $$i; $(MAKE) test) || exit $$?; done
+
+
+clean:
+	@for i in $(DIRS); do \
+	echo "Running example in $$i..."; \
+	(cd $$i; $(MAKE) clean) || exit $$?; done
+
diff --git a/sample_data/cufflinks_example/Makefile b/sample_data/cufflinks_example/Makefile
new file mode 100644
index 0000000..4ab2c6d
--- /dev/null
+++ b/sample_data/cufflinks_example/Makefile
@@ -0,0 +1,6 @@
+test:
+	./runMe.sh
+
+clean:
+	./cleanme.pl
+
diff --git a/sample_data/cufflinks_example/blastp.outfmt6.gz b/sample_data/cufflinks_example/blastp.outfmt6.gz
new file mode 100644
index 0000000..e3824ef
Binary files /dev/null and b/sample_data/cufflinks_example/blastp.outfmt6.gz differ
diff --git a/sample_data/cufflinks_example/blastp.results.outfmt6.gz b/sample_data/cufflinks_example/blastp.results.outfmt6.gz
deleted file mode 100644
index 09e79bf..0000000
Binary files a/sample_data/cufflinks_example/blastp.results.outfmt6.gz and /dev/null differ
diff --git a/sample_data/cufflinks_example/cleanme.pl b/sample_data/cufflinks_example/cleanme.pl
index f5f72ad..1dfa3d9 100755
--- a/sample_data/cufflinks_example/cleanme.pl
+++ b/sample_data/cufflinks_example/cleanme.pl
@@ -17,11 +17,12 @@ my @files_to_keep = qw (cleanme.pl
                         test.tophat.sam.gz
                         transcripts.gtf.gz
                  
-                 
 
+blastp.outfmt6.gz
 pfam.domtblout.gz
-blastp.results.outfmt6.gz
-                        );
+Makefile
+
+                                         );
 
 
 my %keep = map { + $_ => 1 } @files_to_keep;
diff --git a/sample_data/cufflinks_example/pfam.domtblout.gz b/sample_data/cufflinks_example/pfam.domtblout.gz
index 9b7af55..7a22fa6 100644
Binary files a/sample_data/cufflinks_example/pfam.domtblout.gz and b/sample_data/cufflinks_example/pfam.domtblout.gz differ
diff --git a/sample_data/cufflinks_example/runMe.sh b/sample_data/cufflinks_example/runMe.sh
index fc27d70..450faa2 100755
--- a/sample_data/cufflinks_example/runMe.sh
+++ b/sample_data/cufflinks_example/runMe.sh
@@ -1,22 +1,21 @@
-#!/bin/bash -e
+#!/bin/bash -ve
 
-if [ -e test.genome.fasta.gz ] && [ ! -e test.genome.fasta ]; then
+export PERL_HASH_SEED=0
+
+if [ ! -e test.genome.fasta ]; then
     gunzip -c test.genome.fasta.gz > test.genome.fasta
 fi
 
-if [ -e test.tophat.sam.gz ] && [ ! -e test.tophat.sam ]; then
-    gunzip -c test.tophat.sam.gz > test.tophat.sam
-fi
 
-if [ -e transcripts.gtf.gz ] && [ ! -e transcripts.gtf ]; then
+if [ ! -e transcripts.gtf ]; then
     gunzip -c transcripts.gtf.gz > transcripts.gtf
 fi
 
-if [ -e blastp.results.outfmt6.gz ] && [ ! -e blastp.results.outfmt6 ]; then
-    gunzip -c blastp.results.outfmt6.gz > blastp.results.outfmt6
+if [ ! -e blastp.outfmt6 ]; then
+    gunzip -c blastp.outfmt6.gz > blastp.outfmt6
 fi
 
-if [ -e pfam.domtblout.gz ] && [ ! -e pfam.domtblout ]; then
+if [ ! -e pfam.domtblout ]; then
     gunzip -c pfam.domtblout.gz > pfam.domtblout
 fi
 
@@ -32,9 +31,21 @@ fi
 
 
 ## Predict likely ORFs
-if [ $1 ]; then
+if [ 1 ]; then   # always doing this now.
+
+    # this is how I would have run blast and pfam but I'm using precomputed results for ease of demonstration.
+    #BLASTDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/uniprot_sprot.pep
+    #PFAMDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/Pfam-A.hmm
+    #
+    ## run blast
+    #blastp -query transcripts.fasta.transdecoder_dir/longest_orfs.pep -db $BLASTDB -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > blastp.outfmt6
+    #
+    ## run pfam
+    #hmmscan --domtblout pfam.domtblout $PFAMDB transcripts.fasta.transdecoder_dir/longest_orfs.pep > pfam.log
+    
+    
     ## use pfam and blast results:
-    ../../TransDecoder.Predict  -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.results.outfmt6 -v
+    ../../TransDecoder.Predict  -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6 --single_best_orf  -v
 else
     # just coding metrics
     ../../TransDecoder.Predict -t transcripts.fasta 
@@ -52,10 +63,8 @@ fi
 # convert the genome-based gene-gff3 file to bed
 ../../util/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed
 
-echo
-echo
-echo Done!  Coding region genome annotations provided as: best_candidates.eclipsed_orfs_removed.genome.gff3
-echo
-echo 
+
+# Done!  Coding region genome annotations provided as: transcripts.fasta.transdecoder.genome.\*
+
 
 exit 0
diff --git a/sample_data/pasa_example/Makefile b/sample_data/pasa_example/Makefile
new file mode 100644
index 0000000..4ab2c6d
--- /dev/null
+++ b/sample_data/pasa_example/Makefile
@@ -0,0 +1,6 @@
+test:
+	./runMe.sh
+
+clean:
+	./cleanme.pl
+
diff --git a/sample_data/pasa_example/cleanme.pl b/sample_data/pasa_example/cleanme.pl
index 88f5233..2278c9a 100755
--- a/sample_data/pasa_example/cleanme.pl
+++ b/sample_data/pasa_example/cleanme.pl
@@ -17,6 +17,8 @@ genome.fasta.gz
 pasa_assemblies.fasta.gz
 pasa_assemblies.gff3.gz
 runMe.sh
+pasa_assemblies_described.txt.gz
+Makefile
                         );
 
 
diff --git a/sample_data/pasa_example/pasa_assemblies.fasta.gz b/sample_data/pasa_example/pasa_assemblies.fasta.gz
index 3eca927..0da5588 100644
Binary files a/sample_data/pasa_example/pasa_assemblies.fasta.gz and b/sample_data/pasa_example/pasa_assemblies.fasta.gz differ
diff --git a/sample_data/pasa_example/pasa_assemblies.gff3.gz b/sample_data/pasa_example/pasa_assemblies.gff3.gz
index bdb2aac..c635200 100644
Binary files a/sample_data/pasa_example/pasa_assemblies.gff3.gz and b/sample_data/pasa_example/pasa_assemblies.gff3.gz differ
diff --git a/sample_data/pasa_example/pasa_assemblies_described.txt.gz b/sample_data/pasa_example/pasa_assemblies_described.txt.gz
new file mode 100644
index 0000000..4e29025
Binary files /dev/null and b/sample_data/pasa_example/pasa_assemblies_described.txt.gz differ
diff --git a/sample_data/pasa_example/runMe.sh b/sample_data/pasa_example/runMe.sh
index 6470d43..4a625c1 100755
--- a/sample_data/pasa_example/runMe.sh
+++ b/sample_data/pasa_example/runMe.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -ve
 
 if [ ! -e genome.fasta ]; then
     gunzip -c genome.fasta.gz > genome.fasta
@@ -12,10 +12,21 @@ if [ ! -e pasa_assemblies.gff3 ]; then
     gunzip -c pasa_assemblies.gff3.gz > pasa_assemblies.gff3
 fi
 
-../../TransDecoder.LongOrfs -t pasa_assemblies.fasta
+if [ ! -e pasa_assemblies_described.txt ]; then
+    gunzip -c pasa_assemblies_described.txt.gz > pasa_assemblies_described.txt
+fi
+
+
+# get the gene-to-transcript relationships
+cut -f2,3 pasa_assemblies_described.txt > pasa.gene_trans_map.txt
+
+../../TransDecoder.LongOrfs -t pasa_assemblies.fasta --gene_trans_map pasa.gene_trans_map.txt
 
 ../../TransDecoder.Predict -t pasa_assemblies.fasta
 
-../../util/cdna_alignment_orf_to_genome_orf.pl  pasa_assemblies.fasta.transdecoder.gff3 pasa_assemblies.gff3 pasa_assemblies.fasta | tee pasa_assemblies.fasta.transdecoder.genome.gff3
+../../util/cdna_alignment_orf_to_genome_orf.pl  pasa_assemblies.fasta.transdecoder.gff3 pasa_assemblies.gff3 pasa_assemblies.fasta  >  pasa_assemblies.fasta.transdecoder.genome.gff3
+
+echo "Done.  See pasa_assemblies.fasta.transdecoder.\*"
+
 
 exit 0
diff --git a/sample_data/simple_transcriptome_target/Makefile b/sample_data/simple_transcriptome_target/Makefile
new file mode 100644
index 0000000..4ab2c6d
--- /dev/null
+++ b/sample_data/simple_transcriptome_target/Makefile
@@ -0,0 +1,6 @@
+test:
+	./runMe.sh
+
+clean:
+	./cleanme.pl
+
diff --git a/sample_data/simple_transcriptome_target/cleanme.pl b/sample_data/simple_transcriptome_target/cleanme.pl
index bbe0088..de8482d 100755
--- a/sample_data/simple_transcriptome_target/cleanme.pl
+++ b/sample_data/simple_transcriptome_target/cleanme.pl
@@ -15,6 +15,7 @@ my @files_to_keep = qw (
 cleanme.pl
 runMe.sh
 Trinity.fasta.gz
+Makefile
                         );
 
 
diff --git a/sample_data/simple_transcriptome_target/runMe.sh b/sample_data/simple_transcriptome_target/runMe.sh
index c42569a..3ad9343 100755
--- a/sample_data/simple_transcriptome_target/runMe.sh
+++ b/sample_data/simple_transcriptome_target/runMe.sh
@@ -4,8 +4,8 @@ if [ ! -e Trinity.fasta ]; then
     gunzip -c Trinity.fasta.gz > Trinity.fasta
 fi
 
-../../TransDecoder.LongOrfs -t Trinity.fasta
+../../TransDecoder.LongOrfs -t Trinity.fasta 
 
-../../TransDecoder.Predict -t Trinity.fasta
+../../TransDecoder.Predict -t Trinity.fasta  --single_best_orf 
 
 exit 0
diff --git a/util/cdna_alignment_orf_to_genome_orf.pl b/util/cdna_alignment_orf_to_genome_orf.pl
index 4a3b1ee..84d0d55 100755
--- a/util/cdna_alignment_orf_to_genome_orf.pl
+++ b/util/cdna_alignment_orf_to_genome_orf.pl
@@ -41,28 +41,20 @@ main: {
     ## associate gene identifiers with contig id's.
     my $contig_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($cdna_orfs_gff3, $gene_obj_indexer_href);
 
-    foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
+
+    my %isolated_gene_id_to_new_genes;
     
+    foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
+        ## $asmbl_id is the actual Transcript identifier from which ORFs were predicted.
+        
         my @gene_ids = @{$contig_to_gene_list_href->{$asmbl_id}};
     
         foreach my $gene_id (@gene_ids) { # gene identifiers as given by transdecoder on the transcript sequences
             my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
             
-            my $asmbl_id = $gene_obj_ref->{asmbl_id};
-            
-            
-            ## pasa stuff
-            
-            if ($asmbl_id =~ /(S\d+)_(asmbl_\d+)/) { 
-                
-                my $subcluster = $1;
-                $asmbl_id = $2;
-            }
-            
             my $transcript_struct = $cdna_acc_to_transcript_structure{$asmbl_id} or die "Error, no cdna struct for $asmbl_id";
             
-            #print Dumper($transcript_struct) . $gene_obj_ref->toString();
-
+            
             eval {
                 my $new_orf_gene = &place_orf_in_cdna_alignment_context($transcript_struct, $gene_obj_ref, \%cdna_seq_lengths);
                 
@@ -74,14 +66,17 @@ main: {
                     #$new_orf_gene->{Model_feat_name} = "m.$asmbl_id.$orf_count";
                     $new_orf_gene->{com_name} = "ORF";
 
-                    my $use_gene_id = $transcript_struct->{gene_id} || $gene_id;
-                    
-                    
+                    my $use_gene_id = $transcript_struct->{gene_id};
+                    unless ($use_gene_id) {
+                        ## extract the orig gene id from the incoming gene obj
+                        my ($isolated_gene_id, @rest) = split(/::/, $gene_id);
+                        $use_gene_id = $isolated_gene_id;
+                    }
+                                        
                     $new_orf_gene->{TU_feat_name} = $use_gene_id;
                     $new_orf_gene->{Model_feat_name} = $gene_obj_ref->{Model_feat_name};
-                    
-                    
-                    print $new_orf_gene->to_GFF3_format(source => "transdecoder") . "\n";
+
+                    push (@{$isolated_gene_id_to_new_genes{$use_gene_id}}, $new_orf_gene);
                     
                 }
             };
@@ -99,6 +94,25 @@ main: {
         }
     }
 
+    ## output results:
+    foreach my $gene_id (sort keys %isolated_gene_id_to_new_genes) {
+
+        my @gene_objs = @{$isolated_gene_id_to_new_genes{$gene_id}};
+
+        foreach my $gene_obj (@gene_objs) {
+            $gene_obj->set_CDS_phases_from_init_phase(0);
+        }
+
+        my $parent_gene_obj = shift @gene_objs;
+        foreach my $gene_obj (@gene_objs) {
+            $parent_gene_obj->add_isoform($gene_obj);
+        }
+
+        $parent_gene_obj->refine_gene_object();
+        
+        print $parent_gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
+    }
+        
     exit(0);
 
 }
@@ -126,7 +140,7 @@ sub parse_transcript_alignment_info {
         my $trans_id = "";
         my $gene_id = "";
            
-        
+        ## trick for retaining gene and trans identifier information from cufflinks data 
         if ($asmbl =~ /^GENE\^(\S+),TRANS\^(\S+)/) {
             $gene_id = $1;
             $trans_id = $2;
@@ -147,12 +161,12 @@ sub parse_transcript_alignment_info {
                                [$lend, $rend]
                                ],
                                
-                           orient => $orient,
+                               orient => $orient,
                                trans_id => $trans_id,
                                gene_id => $gene_id,
                                
             };
-
+            
             $cdna_alignments{$asmbl} = $struct;
         }
 
@@ -230,7 +244,7 @@ sub place_orf_in_cdna_alignment_context {
         if (scalar(@exon_coords) > 1) {
             # any correct ORF should be in the '+' orientation here.... must be a false positive orf or transcript structure is wrong
             $WARNING_COUNT++;
-            print STDERR "Warning [$WARNING_COUNT], shouldn't have a minus-strand ORF on a spliced transcript structure. Skipping entry.\n";
+            print STDERR "Warning [$WARNING_COUNT], shouldn't have a minus-strand ORF on a spliced transcript structure. Skipping entry $orf_gene_obj->{Model_feat_name}.\n";
             
             return undef;
         }
diff --git a/util/remove_eclipsed_ORFs.pl b/util/remove_eclipsed_ORFs.pl
index 92e7fd1..4cc3e8d 100755
--- a/util/remove_eclipsed_ORFs.pl
+++ b/util/remove_eclipsed_ORFs.pl
@@ -33,7 +33,7 @@ main: {
             
             my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
             
-            my ($lend, $rend) = sort {$a<=>$b} $gene_obj_ref->get_coords();
+            my ($lend, $rend) = sort {$a<=>$b} $gene_obj_ref->get_model_span();
             
             my $struct = { gene_obj => $gene_obj_ref,
                            lend => $lend,
@@ -64,6 +64,8 @@ main: {
                 
                 if ($next_lend > $lend && $next_rend < $rend) {
                     ## eclipsed
+                    my $model_feat_name = $gene->{gene_obj}->{Model_feat_name};
+                    print STDERR "\tECLIPSE: $model_feat_name is eclipsed by longer ORF, removing it.\n";
                     $found_eclipsed = 1;
                     last;
                 }
diff --git a/util/single_best_ORF_per_transcript.pl b/util/single_best_ORF_per_transcript.pl
new file mode 100755
index 0000000..f34f3a3
--- /dev/null
+++ b/util/single_best_ORF_per_transcript.pl
@@ -0,0 +1,187 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Gene_obj_indexer;
+use GFF3_utils;
+use Carp;
+use Data::Dumper;
+use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
+
+
+
+my $usage = <<__EOUSAGE__;
+
+################################################################################
+#
+# Required:
+#
+#  --gff3_file <string>          gff3 file w/ predicted ORFs
+#
+# Optional:
+#
+#  --blast_hits <string>         blast hits file
+#
+#  --pfam_hits <string>          pfam hits file
+#
+################################################################################
+
+
+__EOUSAGE__
+
+    ;
+
+my $gff3_file;
+my $blast_hits_file;
+my $pfam_hits_file;
+
+
+&GetOptions('gff3_file=s' => \$gff3_file,
+            'blast_hits=s' => \$blast_hits_file,
+            'pfam_hits=s' => \$pfam_hits_file,
+    );
+
+unless ($gff3_file) {
+    die $usage;
+}
+
+
+main: {
+
+    my %blast_hits;
+    if ($blast_hits_file) {
+        %blast_hits = &parse_blastp_hits_file($blast_hits_file);
+    }
+
+    my %pfam_hits;
+    if ($pfam_hits_file) {
+        %pfam_hits = &parse_pfam_hits_file($pfam_hits_file);
+    }
+        
+    my $gene_obj_indexer_href = {};
+    
+    my $asmbl_id_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href);
+    
+    foreach my $asmbl_id (sort keys %$asmbl_id_to_gene_list_href) {
+        
+        my @gene_ids = @{$asmbl_id_to_gene_list_href->{$asmbl_id}};
+        
+        #print "ASMBL: $asmbl_id, gene_ids: @gene_ids\n";
+        my @gene_entries;
+        
+        foreach my $gene_id (@gene_ids) {
+            
+            my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
+
+            my $model_id = $gene_obj_ref->{Model_feat_name};
+
+            my $homology_count = 0;
+            if ($blast_hits{$model_id}) {
+                $homology_count++;
+            }
+            if ($pfam_hits{$model_id}) {
+                $homology_count++;
+            }
+            
+            
+            my $struct = { gene_obj => $gene_obj_ref,
+                           length => $gene_obj_ref->get_CDS_length(),
+                           homology_count => $homology_count,
+            };
+            
+            push (@gene_entries, $struct);
+            
+            
+        }
+        
+        @gene_entries = sort {
+        
+            $b->{homology_count} <=> $a->{homology_count}
+            ||
+            $b->{length} <=> $a->{length}
+            
+        } @gene_entries;
+        
+        if (scalar @gene_entries > 1) {
+            print STDERR "ORFs prioritized as:\n";
+            foreach my $entry (@gene_entries) {
+                print STDERR "\t" . join("\t", $entry->{gene_obj}->{Model_feat_name}, 
+                                         "homology_count: " . $entry->{homology_count},
+                                         "len: " . $entry->{length}) . "\n";
+            }
+            
+        }
+        
+        my $best_gene_entry = shift @gene_entries;
+
+        my $gene_obj = $best_gene_entry->{gene_obj};
+            
+        print $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
+        
+        foreach my $remaining_gene (@gene_entries) {
+            print STDERR "-discarding " . $remaining_gene->{gene_obj}->{Model_feat_name} . " as not single best orf on trans\n";
+        }
+        
+    }
+    
+    
+    exit(0);
+
+}
+
+
+
+## note borrowed code below from TransDecoder.predict and should consolidate.   ## //FIXME
+
+sub parse_pfam_hits_file {
+    my ($pfam_hits_file) = @_;
+
+    my %has_pfam_hit;
+
+    if (! -e $pfam_hits_file) {
+        die "Error, cannot find pfam hits file: $pfam_hits_file";
+    }
+
+    print "PFAM output found and processing...\n";
+    # capture those proteins having pfam hits
+    open (my $fh, $pfam_hits_file) or die "Error, cannot open file: $pfam_hits_file";
+    while (my $ln=<$fh>) {
+        next if $ln=~/^\#/;
+        my @x = split(/\s+/,$ln);
+        next unless $x[3];  # domtbl
+        my $orf_acc = $x[3];
+        $has_pfam_hit{$orf_acc} = 1;
+    }
+    close $fh;
+
+
+    return(%has_pfam_hit);
+}
+
+####
+sub parse_blastp_hits_file {
+    my ($blastp_file) = @_;
+
+    unless (-e $blastp_file) {
+        die "Error, cannot find file $blastp_file";
+    }
+
+    my %blastp_hits;
+
+    open (my $fh, $blastp_file) or die "Error, cannot open file $blastp_file";
+    while (<$fh>) {
+        chomp;
+        my @x = split(/\t/);
+        my $id = $x[0];
+
+        $blastp_hits{$id} = 1;
+    }
+    close $fh;
+
+    return(%blastp_hits);
+}
+
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/transdecoder.git



More information about the debian-med-commit mailing list