[med-svn] [transdecoder] 01/05: Imported Upstream version 2.0.1

Michael Crusoe misterc-guest at moszumanska.debian.org
Fri Feb 13 12:23:10 UTC 2015


This is an automated email from the git hooks/post-receive script.

misterc-guest pushed a commit to branch master
in repository transdecoder.

commit e6b84af49bbe96efe66862be6f4fb1e3b024187d
Author: Michael R. Crusoe <mcrusoe at msu.edu>
Date:   Fri Feb 13 06:15:14 2015 -0500

    Imported Upstream version 2.0.1
---
 LICENSE.txt                                       |   17 +
 Makefile                                          |   13 +
 PerlLib/Fasta_reader.pm                           |  169 +
 PerlLib/GFF3_utils.pm                             |  256 +
 PerlLib/Gene_obj.pm                               | 5441 +++++++++++++++++++++
 PerlLib/Gene_obj_indexer.pm                       |   75 +
 PerlLib/Longest_orf.pm                            |  371 ++
 PerlLib/Nuc_translator.pm                         |  415 ++
 PerlLib/TiedHash.pm                               |  199 +
 README                                            |   27 +
 README.md                                         |   26 +
 Release.Notes                                     |   20 +
 TransDecoder.LongOrfs                             |  301 ++
 TransDecoder.Predict                              |  364 ++
 notes                                             |    1 +
 sample_data/blastp.results.outfmt6.gz             |  Bin 0 -> 1808 bytes
 sample_data/cleanme.pl                            |   42 +
 sample_data/pfam.domtblout.gz                     |  Bin 0 -> 12838 bytes
 sample_data/runMe.sh                              |   62 +
 sample_data/test.genome.fasta.gz                  |  Bin 0 -> 62361 bytes
 sample_data/test.tophat.sam.gz                    |  Bin 0 -> 9125143 bytes
 sample_data/transcripts.gtf.gz                    |  Bin 0 -> 4149 bytes
 transdecoder_plugins/Makefile                     |   17 +
 transdecoder_plugins/cd-hit-v4.6.1-2012-08-27.tgz |  Bin 0 -> 652425 bytes
 util/bin/.hidden                                  |    0
 util/cdna_alignment_orf_to_genome_orf.pl          |  327 ++
 util/compute_base_probs.pl                        |   67 +
 util/cufflinks_gtf_genome_to_cdna_fasta.pl        |  113 +
 util/cufflinks_gtf_to_alignment_gff3.pl           |   97 +
 util/cufflinks_gtf_to_bed.pl                      |   98 +
 util/ffindex_resume.pl                            |   51 +
 util/gene_list_to_gff.pl                          |   45 +
 util/get_top_longest_fasta_entries.pl             |   48 +
 util/gff3_file_to_bed.pl                          |   43 +
 util/gff3_file_to_proteins.pl                     |  164 +
 util/index_gff3_files_by_isoform.pl               |   74 +
 util/nr_ORFs_gff3.pl                              |   75 +
 util/pfam_mpi.pbs                                 |   27 +
 util/pfam_runner.pl                               |  309 ++
 util/remove_eclipsed_ORFs.pl                      |   92 +
 util/score_CDS_liklihood_all_6_frames.pl          |   90 +
 util/seq_n_baseprobs_to_logliklihood_vals.pl      |  172 +
 42 files changed, 9708 insertions(+)

diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..d9fe246
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,17 @@
+Copyright (c) 2012, The Broad Institute, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+·         Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+·         Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+·         Neither the name of the Broad Institute nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.**
+
+THIS SOFTWARE IS PROVIDED BY THE BROAD INSTITUTE  ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE BROAD INSTITUTE 
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, 
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e0ea1e1
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
+SHELL := /bin/bash
+
+all:
+	cd ./transdecoder_plugins/ && $(MAKE) all
+
+clean:
+	cd ./transdecoder_plugins/ && $(MAKE) clean
+	cd ./sample_data && ./cleanme.pl
+
+test:
+	cd ./sample_data/ && ./runMe.sh
+
+
diff --git a/PerlLib/Fasta_reader.pm b/PerlLib/Fasta_reader.pm
new file mode 100755
index 0000000..c60d45e
--- /dev/null
+++ b/PerlLib/Fasta_reader.pm
@@ -0,0 +1,169 @@
+#!/usr/local/bin/perl -w
+
+# lightweight fasta reader capabilities:
+package Fasta_reader;
+
+use strict;
+
+sub new {
+    # Constructor: creates a Fasta_reader that iterates over FASTA records.
+    #
+    # Parameters:
+    #   $packagename - class to bless the reader into
+    #   $fastaFile   - either a filename, or an already-open filehandle
+    #                  (a GLOB reference or any IO::Handle-derived object)
+    # Returns: the new reader object.
+    # Dies if no input is given or a named file cannot be opened for reading.
+    my ($packagename, $fastaFile) = @_;
+
+    my $self = { fastaFile  => undef,    # filename; stays undef when a handle was supplied
+                 fileHandle => undef };  # handle that next() reads from
+
+    bless ($self, $packagename);
+
+    my $filehandle = undef;
+
+    ## Accept every flavour of handle (lexical/bareword GLOB refs, IO::File, ...),
+    ## not only objects blessed exactly into 'IO::Handle'.
+    if (ref $fastaFile
+        && (ref $fastaFile eq 'GLOB' || UNIVERSAL::isa($fastaFile, 'IO::Handle'))) {
+        $filehandle = $fastaFile;
+    }
+    else {
+        ## an undefined name must be rejected explicitly: three-arg open() on
+        ## undef would silently open an anonymous temporary file.
+        unless (defined $fastaFile && length $fastaFile) {
+            die "Error: no fasta file specified\n";
+        }
+
+        ## three-arg open: the filename can never be interpreted as a mode or a pipe.
+        open ($filehandle, '<', $fastaFile) or die "Error: Couldn't open $fastaFile: $!\n";
+        $self->{fastaFile} = $fastaFile;
+    }
+
+    $self->{fileHandle} = $filehandle;
+
+    return ($self);
+}
+
+
+
+#### next() fetches next Sequence object.
+#
+# Reads one FASTA record from the reader's filehandle and returns it as a
+# Sequence object; returns undef once no further record is available (or if
+# the reader has already been finish()ed).
+sub next {
+    my $self = shift;
+
+    my $filehandle = $self->{fileHandle};
+    unless (defined $filehandle) {
+        return (undef);  # reader was closed via finish(); nothing left to read
+    }
+
+    ## Records are delimited by a newline followed by '>'.  local() guarantees the
+    ## caller's record separator is restored on every exit path, including die().
+    local $/ = "\n>";
+
+    my $next_text_input = <$filehandle>;
+
+    if (defined($next_text_input) && $next_text_input !~ /\w/) {
+        ## must have been some whitespace at start of fasta file, before first entry.
+        ## try again:
+        $next_text_input = <$filehandle>;
+    }
+
+    my $seqobj = undef;
+
+    if ($next_text_input) {
+        $next_text_input =~ s/^>|>$//g; # strip the leading '>' and the trailing '>' left by the record separator
+        $next_text_input =~ tr/\t\n\000-\037\177-\377/\t\n/d; # drop control / non-ASCII bytes, keeping tab and newline
+        my ($header, @seqlines) = split (/\n/, $next_text_input);
+        my $sequence = join ("", @seqlines);
+        $sequence =~ s/\s//g;
+
+        $seqobj = Sequence->new($header, $sequence);
+    }
+
+    return ($seqobj); # undef if no record was read
+}
+
+
+#### finish() closes the open filehandle to the query database.
+#
+# Safe to call more than once: a reader whose handle was already released is
+# left untouched instead of attempting close() on an undefined value.
+sub finish {
+    my $self = shift;
+    my $filehandle = $self->{fileHandle};
+    if (defined $filehandle) {
+        close $filehandle;
+    }
+    $self->{fileHandle} = undef;
+}
+
+####
+sub retrieve_all_seqs_hash {
+    # Drains the reader via next() and returns a flat hash (accession => sequence)
+    # covering every remaining record; a repeated accession keeps the last
+    # sequence read for it.
+    my ($self) = @_;
+
+    my %seq_for;
+
+    while (my $record = $self->next()) {
+        my $accession = $record->get_accession();
+        my $residues  = $record->get_sequence();
+        $seq_for{$accession} = $residues;
+    }
+
+    return (%seq_for);
+}
+
+
+
+##############################################
+package Sequence;
+use strict;
+
+sub new {
+    # Constructor for a single FASTA record.
+    #   $header   - full header line (without the leading '>')
+    #   $sequence - sequence string
+    # The accession is the first whitespace-delimited token of the header.
+    my ($packagename, $header, $sequence) = @_;
+
+    my ($accession) = split (/\s+/, $header, 2);
+
+    my %record = ( accession => $accession,
+                   header    => $header,
+                   sequence  => $sequence,
+                   filename  => undef );
+
+    return (bless (\%record, $packagename));
+}
+
+####
+sub get_accession {
+    # Accessor: returns the accession stored at construction time.
+    my ($self) = @_;
+    return $self->{accession};
+}
+
+####
+sub get_header {
+    # Accessor: returns the full header line stored at construction time.
+    my ($self) = @_;
+    return $self->{header};
+}
+
+####
+sub get_sequence {
+    # Accessor: returns the sequence string stored at construction time.
+    my ($self) = @_;
+    return $self->{sequence};
+}
+
+#### 
+sub get_FASTA_format {
+    # Returns the record as FASTA text: ">header" followed by the sequence
+    # wrapped at 60 characters per line, terminated by exactly one newline.
+    my $self = shift;
+    my $header = $self->get_header();
+    my $sequence = $self->get_sequence();
+    $sequence =~ s/(\S{60})/$1\n/g;
+    ## when the length is an exact multiple of 60 the wrap above leaves a trailing
+    ## newline; drop it so the entry does not end with an empty line.
+    $sequence =~ s/\n\z//;
+    my $fasta_entry = ">$header\n$sequence\n";
+    return ($fasta_entry);
+}
+
+
+####
+sub write_fasta_file {
+    # Writes this record in FASTA format to a file and returns the filename used.
+    #   $filename - optional output path; when omitted the file is named
+    #               "<accession>.fasta" (non-word characters replaced by '_')
+    #               and is created relative to the current directory.
+    # Dies if the file cannot be opened for writing or cannot be closed cleanly.
+    my $self = shift;
+    my $filename = shift;
+
+    my $fasta_entry = $self->get_FASTA_format();
+
+    my $tempfile;
+    if ($filename) {
+        $tempfile = $filename;
+    } else {
+        my $acc = $self->{accession};
+        $acc =~ s/\W/_/g;
+        $tempfile = "$acc.fasta";
+    }
+
+    ## lexical handle + three-arg open: no global bareword handle, and the
+    ## filename is never parsed for a mode.
+    open (my $ofh, '>', $tempfile) or die "ERROR! Couldn't write a temporary file in current directory ($tempfile): $!\n";
+    print $ofh $fasta_entry;
+    ## buffered write errors only surface at close time
+    close $ofh or die "ERROR! Couldn't finish writing $tempfile: $!\n";
+    return ($tempfile);
+}
+
+1; #EOM
+
+
diff --git a/PerlLib/GFF3_utils.pm b/PerlLib/GFF3_utils.pm
new file mode 100644
index 0000000..01d7e7e
--- /dev/null
+++ b/PerlLib/GFF3_utils.pm
@@ -0,0 +1,256 @@
+#!/usr/local/bin/perl
+
+package main;
+our $SEE;
+
+
+package GFF3_utils;
+
+use strict;
+use warnings;
+use Gene_obj;
+use Gene_obj_indexer;
+use Carp;
+use URI::Escape;
+use Data::Dumper;
+
+
+####
+sub index_GFF3_gene_objs {
+    # Parses a GFF3 file and builds one Gene_obj per gene (alternative
+    # transcripts are attached as isoforms of the first transcript built).
+    #
+    # Parameters:
+    #   $gff_filename     - path of the GFF3 file to read
+    #   $gene_obj_indexer - where finished genes are stored: either a plain hash
+    #                       reference (gene_id => Gene_obj) or an object providing
+    #                       store_gene($gene_id, $gene_obj)
+    #   $contig_id        - optional; when given, only features on this contig are used
+    #
+    # Returns: reference to a hash of contig id => array ref of gene ids.
+    #
+    # Only gene, mRNA, CDS and exon features are considered.  Dies on features
+    # lacking an ID (or, for non-gene features, a Parent) attribute and on
+    # transcripts without exon coordinates.
+
+    my ($gff_filename, $gene_obj_indexer, $contig_id) = @_;
+    # contig_id is optional.
+
+    ## note can use either a gene_obj_indexer or a hash reference.
+    my $hash_mode = 0;
+    if (ref $gene_obj_indexer eq 'HASH') {
+        $hash_mode = 1;
+    }
+
+    my %gene_coords;               # contig -> gene -> transcript -> feature type -> end5 -> end3
+    my %asmbl_id_to_gene_id_list;  # contig -> [gene ids]; this is the return value
+    my %transcript_to_gene;        # mRNA id -> parent gene id
+    my %cds_phases;                # gene -> transcript -> CDS end5 -> phase
+
+    my %gene_names;                # gene id -> "Name Note" text
+    my %loci;                      # feature id -> Alias value
+
+    open (my $fh, '<', $gff_filename) or die "Error, cannot open file $gff_filename: $!";
+
+    my %gene_id_to_source_type;
+
+    # NOTE(review): %source_tracker is never populated in this sub, so the
+    # consistency check that consults it below cannot currently fire.
+    my %source_tracker;
+
+    # print STDERR "\n-parsing file $gff_filename\n";
+    while (<$fh>) {
+
+        chomp;
+
+        unless (/\w/) { next;} # empty line
+
+        if (/^\#/) { next; } # comment entry in gff3
+
+        my @x = split (/\t/);
+
+        unless (scalar @x >= 9) {
+            print STDERR "-ignoring line $_\n";
+            next;
+        }
+
+        my ($asmbl_id, $source, $feat_type, $lend, $rend, $orient, $cds_phase, $gene_info) = ($x[0], $x[1], $x[2], $x[3], $x[4], $x[6], $x[7], $x[8]);
+
+        if ($contig_id && $asmbl_id ne $contig_id) { next; }
+
+        unless ($feat_type) { die "Error, $_, no feat_type: line\[$_\]"; }
+
+        unless ($feat_type =~ /^(?:gene|mRNA|CDS|exon)$/) { next;} ## these are the only fields I care about right now.
+
+        $gene_info = uri_unescape($gene_info);
+
+        ## Take the capture from the match itself: reading $1 after a failed match
+        ## would silently pick up the value left by an earlier successful match.
+        my ($id) = ($gene_info =~ /ID=([^;\s]+);?/);
+        unless (defined $id && length $id) {
+            die "Error, couldn't get the id field $_";
+        }
+
+        if (exists $source_tracker{$id} && $source_tracker{$id} ne $source) {
+            confess "Error, gene ID $id is given source $source when previously encountered with source $source_tracker{$id} ";
+        }
+
+        if ($feat_type eq 'gene') {
+            my $gene_name = "";
+            if ($gene_info =~ /Name=\"?([^\;\"]+)\"?/) {
+                $gene_name = $1;
+            }
+
+            if ($gene_info =~ /Note=\"?([^\;\"]+)\"?/) {
+                $gene_name .= " $1";
+            }
+
+            $gene_names{$id} = $gene_name;
+        }
+
+        if ($gene_info =~ /Alias=([^;]+)/) {
+            my $locus = $1;
+            $loci{$id} = $locus;
+        }
+
+        if ($feat_type eq 'gene') { next;} ## beyond this pt, gene is not needed.
+
+        my ($parent) = ($gene_info =~ /Parent=([^;\s]+);?/);
+        unless (defined $parent && length $parent) {
+            die "Error, couldn't get the parent info $_";
+        }
+
+        # print "id: $id, parent: $parent\n";
+
+        if ($feat_type eq 'mRNA') {
+            ## just get the identifier info
+            $transcript_to_gene{$id} = $parent;
+            next;
+        }
+
+        ## CDS / exon features: the parent is the transcript
+        my $transcript_id = $parent;
+        my $gene_id = $transcript_to_gene{$transcript_id};
+        unless (defined $gene_id) {
+            print STDERR "Error, no gene feature found for $transcript_id.... ignoring feature.\n";
+            next;
+        }
+
+        $gene_id_to_source_type{$gene_id} = $source;
+
+        ## coordinates are stored transcript-oriented (end5 -> end3)
+        my ($end5, $end3) = ($orient eq '+') ? ($lend, $rend) : ($rend, $lend);
+
+        $gene_coords{$asmbl_id}->{$gene_id}->{$transcript_id}->{$feat_type}->{$end5} = $end3;
+        # print "$asmbl_id, $gene_id, $transcript_id, $feat_type, $end5, $end3\n";
+
+        if ($cds_phase =~ /^\d+$/) {
+            $cds_phases{$gene_id}->{$transcript_id}->{$end5} = $cds_phase;
+        }
+
+    }
+    close $fh;
+
+    ##
+    # print STDERR "\n-caching genes.\n";
+    foreach my $asmbl_id (sort keys %gene_coords) {
+        my $genes_href = $gene_coords{$asmbl_id};
+
+        foreach my $gene_id (keys %$genes_href) {
+            print STDERR "\r-indexing [$gene_id]  ";
+            my $transcripts_href = $genes_href->{$gene_id};
+
+            my @gene_objs;
+
+            foreach my $transcript_id (keys %$transcripts_href) {
+
+                my $cds_coords_href = $transcripts_href->{$transcript_id}->{CDS} || {}; # could be a noncoding transcript w/ no CDS
+                my $exon_coords_href = $transcripts_href->{$transcript_id}->{exon};
+
+                unless (ref $exon_coords_href) {
+                    print STDERR Dumper ($transcripts_href);
+                    die "Error, missing exon coords for $transcript_id, $gene_id\n";
+                }
+
+                my $gene_obj = Gene_obj->new();
+
+                if (scalar (keys %$cds_coords_href) == 1) {
+
+                    ## could be that only the cds span was provided.
+                    ## break it up across the exon segments
+
+                    my ($cds_lend, $cds_rend) = sort {$a<=>$b} %$cds_coords_href;
+                    my @exon_coords;
+                    my $orient;
+                    foreach my $exon_end5 (keys %$exon_coords_href) {
+                        my $exon_end3 = $exon_coords_href->{$exon_end5};
+                        push (@exon_coords, [$exon_end5, $exon_end3]);
+                        ## orientation is inferred from any exon longer than one base
+                        if ($exon_end5 < $exon_end3) {
+                            $orient = '+';
+                        }
+                        elsif ($exon_end5 > $exon_end3) {
+                            $orient = '-';
+                        }
+                    }
+
+                    $gene_obj->build_gene_obj_exons_n_cds_range(\@exon_coords, $cds_lend, $cds_rend, $orient);
+                }
+                else {
+
+                    ## cds and exons specified separately
+
+                    $gene_obj->populate_gene_obj($cds_coords_href, $exon_coords_href);
+                }
+
+                $gene_obj->{Model_feat_name} = $transcript_id;
+                $gene_obj->{TU_feat_name} = $gene_id;
+                $gene_obj->{asmbl_id} = $asmbl_id;
+
+                if (my $gene_locus = $loci{$gene_id}) {
+                    $gene_obj->{pub_locus} = $gene_locus;
+                }
+                if (my $transcript_locus = $loci{$transcript_id}) {
+                    $gene_obj->{model_pub_locus} = $transcript_locus;
+                }
+
+                $gene_obj->{com_name} = $gene_names{$gene_id} || $transcript_id;
+
+                $gene_obj->{source} = $gene_id_to_source_type{$gene_id};
+
+                ## set CDS phase info if available from the gff
+                my $cds_phases_href = $cds_phases{$gene_id}->{$transcript_id};
+                if (ref $cds_phases_href) {
+                    ## set the cds phases
+                    my @exons = $gene_obj->get_exons();
+                    foreach my $exon (@exons) {
+                        if (my $cds = $exon->get_CDS_obj()) {
+                            my ($end5, $end3) = $cds->get_coords();
+                            my $phase = $cds_phases_href->{$end5};
+                            ## defined-check first so a missing phase reports the real
+                            ## problem rather than an uninitialized-value warning
+                            unless (defined $phase && $phase =~ /\d+/) {
+                                confess "Error, should have phase set for cds $gene_id $transcript_id $end5, but I do not. ";
+                            }
+                            $cds->set_phase($phase);
+                        }
+                    }
+                }
+
+                push (@gene_objs, $gene_obj);
+            }
+
+            ## want single gene that includes all alt splice variants here
+            my $template_gene_obj = shift @gene_objs;
+            foreach my $other_gene_obj (@gene_objs) {
+                $template_gene_obj->add_isoform($other_gene_obj);
+            }
+
+            $template_gene_obj->refine_gene_object();
+
+            if ($hash_mode) {
+                $gene_obj_indexer->{$gene_id} = $template_gene_obj;
+            }
+            else {
+                $gene_obj_indexer->store_gene($gene_id, $template_gene_obj);
+            }
+
+            print "GFF3_utils: stored $gene_id\n" if $SEE;
+
+            # add to gene list for asmbl_id
+            my $gene_list_aref = $asmbl_id_to_gene_id_list{$asmbl_id};
+            unless (ref $gene_list_aref) {
+                $gene_list_aref = $asmbl_id_to_gene_id_list{$asmbl_id} = [];
+            }
+            push (@$gene_list_aref, $gene_id);
+        }
+    }
+    print STDERR "\n";
+    return (\%asmbl_id_to_gene_id_list);
+}
+
+
+1; #EOM
diff --git a/PerlLib/Gene_obj.pm b/PerlLib/Gene_obj.pm
new file mode 100755
index 0000000..990011c
--- /dev/null
+++ b/PerlLib/Gene_obj.pm
@@ -0,0 +1,5441 @@
+#!/usr/bin/env perl
+
+package main;
+our $DEBUG;
+
+package Gene_obj;
+use strict;
+use Nuc_translator;
+#use Gene_ontology;
+use Longest_orf;
+use Storable qw (store retrieve freeze thaw dclone);
+use warnings;
+use Data::Dumper;
+use Carp qw (croak cluck confess);
+use URI::Escape;
+
+=head1 NAME
+
+package Gene_obj
+
+=cut
+
+
+
+=head1 DESCRIPTION
+
+    Gene_obj(s) encapsulate the elements of both gene structure and gene function. The gene structure is stored in a hierarchical fashion as follows:
+
+    Gene  =========================================================
+
+    Exon  =========     =========         =========        ========
+
+    CDS      ======     =========         ======
+
+   
+    where a Gene is a container for Exon(s), and each Exon can contain at most a single CDS component.  An Exon lacking a CDS component is an untranslated (UTR) exon.  The region of an Exon which extends beyond its CDS is also considered UTR.
+  
+    
+    There are several ways to instantiate gene objects.  A simple example is described:
+
+    Exon and CDS component coordinates can be assigned as hashes.
+
+    ie. 
+    
+    my %mRNA = ( 100 => 200,
+	         300 => 500 );
+
+    my %CDS = ( 150=>200,
+		300=>450);
+
+    my $sequence = "GACTACATTTAATAGGGCCC"; #string representing the genomic sequence
+    my $gene = new Gene_obj();
+    
+    $gene->{com_name} = "hypothetical protein";
+
+    $gene->populate_gene_obj(\%CDS, \%mRNA, \$sequence);
+    print $gene->toString();
+
+    
+    
+    Alternatively, the individual components of genes (Exons and CDSs) can be instantiated separately and used to build the Gene from the ground up (See packages mRNA_exon_obj and CDS_exon_obj following this Gene_obj documentation).
+    
+    my $cds_exon = new CDS_exon_obj (150, 200);
+    
+    my $mRNA_exon = new mRNA_exon_obj (100, 200);
+   
+    $mRNA_exon->set_CDS_exon_obj($cds_exon);
+
+    my $gene_obj = new Gene_obj ();
+
+    $gene_obj->{gene_name} = "hypothetical gene";
+    $gene_obj->{com_name} = "hypothetical protein";
+  
+    $gene_obj->add_mRNA_exon_obj($mRNA_exon);
+
+    $gene_obj->refine_gene_object();
+
+    $gene_obj->create_all_sequence_types (\$sequence);  #ref to genomic sequence string.    
+
+    print $gene_obj->toString();
+
+
+    The API below describes useful functions for navigating and manipulating the Gene object along with all of its attributes.
+    
+
+
+=cut
+
+
+
+
+
+
+=over 4
+
+=item new()
+
+B<Description:> Constructor for Gene_obj 
+
+B<Parameters:> none
+
+B<Returns:> $gene_obj
+
+
+The Gene_obj contains several attributes which can be manipulated directly (or by get/set methods if they exist).  These attributes include:
+
+    asmbl_id # identifier for the genomic contig for which this gene is anchored.
+    TU_feat_name #feat_names are TIGR temporary identifiers.
+    Model_feat_name # temp TIGR identifier for gene models
+    locus  #identifier for a gene (TU) ie. T2P3.5
+    pub_locus  #another identifier for a gene (TU)   ie. At2g00010
+    model_pub_locus #identifier for a gene model (model)  ie. At2g00010.1
+    model_locus #analogous to locus, but for model rather than gene (TU)
+    alt_locus   #alternative locus 
+    gene_name # name for gene
+    com_name  # name for gene product 
+    comment #internal comment
+    pub_comment #comment related to gene
+    ec_num   # enzyme commission number
+    gene_sym  # gene symbol
+    is_5prime_partial # 0|1  missing start codon.
+    is_3prime_partial # 0|1  missing stop codon.
+    is_pseudogene # 0|1
+    curated_com_name # 0|1
+    curated_gene_structure # 0|1
+    
+    ## Other attributes set internally  Access-only, do not set directly.
+        
+    gene_length  # length of gene span (int).
+    mid_pt  # holds midpoint of gene-span
+    strand  # [+-]
+    protein_seq # holds protein sequence
+    protein_seq_length
+    CDS_sequence  #holds CDS sequence (translated to protein); based on CDS_exon coordinates
+    CDS_seq_length 
+    cDNA_sequence  #holds cDNA sequence; based on mRNA exon coordinates.
+    cDNA_seq_length 
+    gene_sequence #holds unspliced transcript
+    gene_sequence_length #length of unspliced transcript
+    gene_type # "protein-coding", #default type for gene object.  Could be changed to "rRNA|snoRNA|snRNA|tRNA" to accommodate other gene or feature types.
+    num_additional_isoforms # int 
+    
+    
+=back
+
+=cut
+
+
+
+sub new {
+    # Constructor: returns an empty Gene_obj with every documented attribute
+    # initialized to its default.  Takes no parameters besides the invocant.
+    #
+    # The object is blessed into the invoking class (or the class of the invoking
+    # object), so subclasses inherit this constructor correctly; when called as a
+    # plain function it falls back to this package.
+    my $class = shift;
+    my $self = { asmbl_id => 0, #genomic contig ID
+                 locus => undef,       #text
+                 pub_locus => undef,   #text  ie. At2g00010
+                 model_pub_locus =>undef, #text ie. At2g00010.1
+                 model_locus => undef, #text ie. F12G15.1
+                 alt_locus => undef,   #text
+                 gene_name => undef, #text
+                 com_name => undef,    #text
+                 comment => undef,
+                 curated_com_name => 0,
+                 curated_gene_structure => 0,
+                 pub_comment => undef, #text
+                 ec_num => undef, #text (enzyme commission number)
+                 gene_type => "protein-coding", #default type for gene object.  Could be changed to "rRNA|snoRNA|snRNA|tRNA" to accommodate other gene or feature types.
+                 gene_sym => undef, #text (gene symbol)
+                 mRNA_coords => 0, #assigned to anonymous hash of end5->end3 relative to the parent sequence
+                 CDS_coords => 0,  #assigned to anonymous hash of end5->end3 relative to the parent sequence
+                 mRNA_exon_objs => 0,  # holds arrayref to mRNA_obj, retrieve only thru method: get_exons()
+                 num_exons => 0,      # number of exons in this gene_obj
+                 model_span => [],     # holds array ref to (end5,end3) for CDS range of gene.
+                 gene_span => [],      # holds array ref to (end5,end3) for mRNA range of gene.
+                 gene_length => 0,     # length of gene span (int).
+                 mid_pt => 0,         # holds midpoint of gene-span
+                 strand => 0,      # [+-]
+                 gi => undef,          #text
+                 prot_acc => undef,     #text
+                 is_pseudogene => 0, # toggle indicating pseudogene if 1.
+                 is_5prime_partial => 0, #boolean indicating missing 5' part of gene.
+                 is_3prime_partial => 0, #boolean
+                 protein_seq => undef,    # holds protein sequence
+                 protein_seq_length => 0,
+                 CDS_sequence => undef,    #holds CDS sequence (translated to protein); based on CDS_exon coordinates
+                 CDS_seq_length => 0,
+                 cDNA_sequence => undef,   #holds cDNA sequence; based on mRNA exon coordinates.
+                 cDNA_seq_length => 0,
+                 gene_sequence => undef, #holds unspliced transcript
+                 gene_sequence_length => 0, #length of unspliced transcript
+                 TU_feat_name => undef,    #feat_names are TIGR temporary identifiers.
+                 Model_feat_name =>undef,
+                 classification => 'annotated_genes', #type of seq_element.
+                 gene_synonyms => [],    #list of synonymous model feat_names
+                 GeneOntology=>[], #list of Gene_ontology assignment objects.  ...see GeneOntology.pm
+
+                 ## Additional functional attributes:
+                 secondary_gene_names => [],
+                 secondary_product_names => [],
+                 secondary_gene_symbols => [],
+                 secondary_ec_numbers =>[],
+
+
+                 ## Alternative splicing support.
+                 num_additional_isoforms => 0,  # number of additional isoforms stored in additional_isoforms list below
+                 additional_isoforms => [] # stores list of Gene_objs corresponding to the additional isoforms.
+
+                 };
+    bless($self, ref($class) || $class || __PACKAGE__);
+    return ($self);
+}
+
+
+
+
+=over 4
+
+=item erase_gene_structure()
+
+B<Description:> Removes the structural components of a gene (ie. exons, CDSs, coordinate spans, any corresponding sequences)
+
+B<Parameters:> none
+
+B<Returns:> none 
+
+=back
+
+=cut
+
+
+## erase gene structure
+sub erase_gene_structure {
+    # Resets the structural attributes of this gene (exon list, spans, strand,
+    # derived sequences and their lengths) to empty values.  Returns nothing useful.
+    my ($self) = @_;
+
+    ## scalar attributes go back to 0
+    foreach my $attribute (qw(mRNA_exon_objs num_exons)) {
+        $self->{$attribute} = 0;
+    }
+
+    ## coordinate spans each get their own fresh empty array
+    foreach my $span_attribute (qw(model_span gene_span)) {
+        $self->{$span_attribute} = [];
+    }
+
+    foreach my $attribute (qw(gene_length strand protein_seq
+                              CDS_sequence CDS_seq_length
+                              cDNA_sequence)) {
+        $self->{$attribute} = 0;
+    }
+
+    $self->{cDNA_seq_length} = 0;
+}
+
+
+=over 4
+
+=item clone_gene()
+
+B<Description:> Clones this Gene_obj by copying attributes from this Gene to a new gene.  Does NOT do a deep clone for all attributes.  See dclone() for a more rigorous cloning method.  This method is safer because all references are not cloned, only the critical ones.
+
+B<Parameters:> none
+
+B<Returns:> new Gene_obj
+
+=back
+
+=cut
+
+
+
+## Exons, isoforms and GO assignments are cloned; plain (non-reference) attribute
+## values are copied; any other reference-valued attribute is not carried over.
+sub clone_gene {
+    my ($self) = @_;
+    my $clone = Gene_obj->new();
+
+    ## Copy the simple attribute values across.
+    foreach my $attribute (keys %$self) {
+        my $value = $self->{$attribute};
+
+        ## references are never copied here
+        next if (ref $value);
+
+        ## long strings (e.g. protein/nucleotide sequences, > 200 chars) are skipped too
+        next if (defined $value && length($value) > 200);
+
+        $clone->{$attribute} = $value;
+    }
+
+    ## the clone receives its own copy of the synonym list
+    $clone->{gene_synonyms} = [ @{$self->{gene_synonyms}} ];
+
+    ## GO assignments are deep-copied one by one
+    my @go_objs = $self->get_gene_ontology_objs();
+    foreach my $go_obj (@go_objs) {
+        my $go_copy = dclone($go_obj);
+        $clone->add_gene_ontology_objs($go_copy);
+    }
+
+    ## gene structure: every exon is cloned and attached to the new gene
+    my @source_exons = $self->get_exons();
+    foreach my $source_exon (@source_exons) {
+        my $exon_copy = $source_exon->clone_exon();
+        $clone->add_mRNA_exon_obj($exon_copy);
+    }
+
+    ## isoforms are cloned recursively
+    foreach my $isoform ($self->get_additional_isoforms()) {
+        $clone->add_isoform($isoform->clone_gene());
+    }
+
+    $clone->refine_gene_object();
+
+    return ($clone);
+}
+
+
+
+
+=over 4
+
+=item deep_clone()
+
+B<Description:> Provides a deep clone of a gene_obj.  Only the reference-valued attributes listed in the Gene_obj documentation are retained; any other reference-valued attribute found on the object is set to undef in the clone.
+
+B<Parameters:> none
+
+B<Returns:> $gene_obj
+
+uses the Storable dclone() function to deep clone the Gene_obj
+
+=back
+
+=cut
+
+
+    ;
+## Deep-copies the gene (and its isoforms) with Storable's dclone(), then sets to
+## undef every reference-valued attribute that is not in the supported list below.
+sub deep_clone {
+    my $self = shift;
+    my $clone = dclone($self);
+
+    ## Reference-valued attributes that survive the clone.  The list covers every
+    ## reference the constructor itself creates; in particular the GO list is
+    ## stored under 'GeneOntology' (the older 'Gene_ontology' spelling is kept
+    ## for backward compatibility).
+    my %supported_refs = (model_span => 1,
+                          gene_span => 1,
+                          gene_synonyms => 1,
+                          Gene_ontology => 1,
+                          GeneOntology => 1,
+                          secondary_gene_names => 1,
+                          secondary_product_names => 1,
+                          secondary_gene_symbols => 1,
+                          secondary_ec_numbers => 1,
+                          additional_isoforms=>1,
+                          mRNA_exon_objs => 1);
+
+    foreach my $gene_obj ($clone, $clone->get_additional_isoforms()) {
+
+        my @keys = keys %$gene_obj;
+        foreach my $key (@keys) {
+            my $value = $gene_obj->{$key};
+            if (ref $value && !$supported_refs{$key}) {
+                $gene_obj->{$key} = undef;
+            }
+        }
+    }
+
+    return ($clone);
+}
+
+
+=over 4
+
+=item populate_gene_obj()
+
+B<Description:> Given CDS and mRNA coordinates stored in hash form, a gene object is populated with mRNA and CDS exons.  This is one available way to populate a newly instantiated Gene_obj.
+
+B<Parameters:> $cds_hash_ref, $mRNA_hash_ref, <$seq_ref>
+
+$mRNA_hash_ref is a reference to a hash holding the end5 => end3 coordinates of the Exons
+
+$cds_hash_ref same as mRNA_hash_ref except holds the CDS end5 => end3 coordinates.
+
+$seq_ref is a reference to a string containing the genomic sequence.  This is an optional parameter.
+
+
+B<Returns:> none
+
+=back
+
+=cut
+
+    ;
+
+## One-stop population of a gene: registers the CDS and mRNA coordinate hashes,
+## refines the gene object and, when a sequence reference is supplied, also
+## builds the derived sequence types.
+##   $cds_ref  - hash ref of CDS end5 => end3
+##   $mRNA_ref - hash ref of exon end5 => end3
+##   $seq_ref  - optional reference to the genomic sequence string
+sub populate_gene_obj {
+    my $self = shift;
+    my ($cds_ref, $mRNA_ref, $seq_ref) = @_;
+
+    $self->set_CDS_coords ($cds_ref);
+    $self->set_mRNA_coords ($mRNA_ref);
+    $self->refine_gene_object();
+
+    $self->create_all_sequence_types($seq_ref) if (ref $seq_ref);
+
+    ## the coordinate hashes are no longer needed once the gene is built
+    $self->{mRNA_coords} = 0;
+    $self->{CDS_coords} = 0;
+}
+
+
+# Alias for populate_gene_obj(): forwards all arguments unchanged.
+sub populate_gene_object {
+    my ($self, @args) = @_;
+    return $self->populate_gene_obj(@args);
+}
+
+
+####
+sub populate_gene_object_via_CDS_coords {
+    # Builds the gene from a list of [end5, end3] array refs: each coordinate pair
+    # becomes an exon whose CDS spans the same coordinates.  The gene is refined
+    # afterwards.  Returns nothing.
+    my ($self, @coordsets) = @_;
+
+    foreach my $pair_aref (@coordsets) {
+        my ($end5, $end3) = @$pair_aref;
+
+        my $exon = mRNA_exon_obj->new($end5, $end3);
+        $exon->{CDS_exon_obj} = CDS_exon_obj->new($end5, $end3);
+
+        $self->add_mRNA_exon_obj($exon);
+    }
+
+    $self->refine_gene_object();
+    return;
+}
+
+
+sub build_gene_obj_exons_n_cds_range {
+    # Populates this gene from a list of exon coordinate pairs plus one overall
+    # CDS range; the CDS range is split across the exons it overlaps.
+    #
+    # Parameters:
+    #   $exons_aref - array ref of [coord, coord] exon pairs (either order)
+    #   $cds_lend, $cds_rend - bounds of the CDS range (either order); the CDS is
+    #                          only built when both are > 0
+    #   $orient     - '+' or '-'
+    # Returns: $self.  Confesses on an invalid orientation, or when a CDS range
+    # was given but overlaps no exon.
+    my ($self, $exons_aref, $cds_lend, $cds_rend, $orient) = @_;
+
+    ## normalize every exon to [lend, rend] and order the exons left to right
+    my @exon_spans = map {; my ($lo, $hi) = sort {$a<=>$b} @$_; [$lo, $hi] } @$exons_aref;
+    @exon_spans = sort {$a->[0]<=>$b->[0]} @exon_spans;
+
+    confess "Error, orient not [+-] " unless ($orient =~ /^[\+\-]$/);
+
+    ## clip the CDS range to each exon it overlaps
+    my @cds_spans;
+
+    if ($cds_lend > 0 && $cds_rend > 0) {
+
+        ($cds_lend, $cds_rend) = sort {$a<=>$b} ($cds_lend, $cds_rend);
+
+        foreach my $span (@exon_spans) {
+            my ($exon_lend, $exon_rend) = @$span;
+
+            ## skip exons lying entirely outside the CDS range
+            next if ($exon_rend < $cds_lend || $exon_lend > $cds_rend);
+
+            my $clip_lend = ($exon_lend > $cds_lend) ? $exon_lend : $cds_lend;
+            my $clip_rend = ($exon_rend < $cds_rend) ? $exon_rend : $cds_rend;
+
+            push (@cds_spans, [$clip_lend, $clip_rend]);
+        }
+
+        @cds_spans or confess "Error, no CDS exon coords built based on exon overlap";
+    }
+
+    ## convert the left-to-right spans into end5 => end3 hashes for the strand
+    my $is_forward = ($orient eq '+');
+
+    my %exon_coords = map {; $is_forward ? ($_->[0] => $_->[1]) : ($_->[1] => $_->[0]) } @exon_spans;
+    my %cds_coords  = map {; $is_forward ? ($_->[0] => $_->[1]) : ($_->[1] => $_->[0]) } @cds_spans;
+
+    # print Dumper (\%cds_coords) . Dumper (\%exon_coords);
+
+    $self->populate_gene_obj(\%cds_coords, \%exon_coords);
+
+    return ($self);
+}
+
+
+####
+sub join_adjacent_exons {
+    # Merge exons that directly abut one another (no gap between the end3 of
+    # one exon and the end5 of the next) into a single exon, then refresh the
+    # derived gene attributes.
+    # Parameters: none.  Returns: nothing.
+    my ($self) = @_;
+
+    my @exons = $self->get_exons();
+
+    # A gene without exons has nothing to merge.  Bail out here: continuing
+    # would store [undef] as the exon list and the refinement below would
+    # then try to call a method on that undefined entry.
+    return unless (@exons);
+
+    my $strand = $self->get_orientation();
+
+    my @new_exons = (shift @exons);
+
+    foreach my $next_exon (@exons) {
+        my $prev_exon = $new_exons[-1];
+        my ($prev_end5, $prev_end3) = $prev_exon->get_coords();
+        my ($next_end5, $next_end3) = $next_exon->get_coords();
+
+        # Adjacent means the next exon starts exactly one base downstream:
+        # +1 on the plus strand, -1 on the minus strand.
+        my $adjacent = ($strand eq '+' && $prev_end3 == $next_end5 - 1)
+                    || ($strand eq '-' && $prev_end3 == $next_end5 + 1);
+
+        if ($adjacent) {
+            $prev_exon->merge_exon($next_exon);
+        }
+        else {
+            push (@new_exons, $next_exon);
+        }
+    }
+
+    $self->{mRNA_exon_objs} = [@new_exons];
+
+    $self->refine_gene_object();
+
+    return;
+}
+
+
+
+
+=over 4
+
+=item AAToNucleotideCoords()
+
+B<Description:> Converts an amino acid -based coordinate to a genomic sequence -based coordinate.
+
+B<Parameters:> $aa_coord
+
+B<Returns:> $genomic_coord
+
+undef is returned if the aa_coord could not be converted.
+
+
+=back
+
+=cut
+
+    ;
+
+sub AAToNucleotideCoords{
+    # Convert a 1-based amino acid position into the genomic coordinate of
+    # the first base of its codon.
+    #   $aacoord : amino acid position (1 = first codon of the CDS)
+    #   $debug   : optional; when true, progress messages are printed
+    # Returns the genomic coordinate, or undef when $aacoord is below 1 or
+    # its codon does not lie completely within the CDS.
+    my ($self, $aacoord, $debug) = @_;
+
+    my $strand = $self->{strand};
+
+    # Visit exons in transcript order: ascending end5 on '+', descending otherwise.
+    my @exons = sort {
+        ($strand eq "+") ? ($a->{end5} <=> $b->{end5}) : ($b->{end5} <=> $a->{end5})
+    } $self->get_exons();
+
+    # Gather each CDS segment as [genomic coordinate of its 5' base, length in bp].
+    my @segments;
+    my $cds_length = 0;
+    foreach my $exon (@exons) {
+        my $cds = $exon->get_CDS_obj();
+        next unless ($cds);
+        my ($lend, $rend) = sort {$a<=>$b} $cds->get_CDS_end5_end3();
+        my $segment_start = ($strand eq "+") ? $lend : $rend;
+        # Coordinates are inclusive, so a segment lend..rend holds
+        # rend - lend + 1 bases (the previous code omitted the +1).
+        my $segment_length = $rend - $lend + 1;
+        push (@segments, [$segment_start, $segment_length]);
+        $cds_length += $segment_length;
+        print "DEBUG: $strand CDS segment start($segment_start) length($segment_length)\n" if($debug);
+    }
+
+    my $translated_bp;   # stays undef when the residue cannot be placed
+
+    # 0-based offset, within the spliced CDS, of the codon's first base.
+    my $cds_offset = (defined($aacoord) && $aacoord >= 1) ? ($aacoord - 1) * 3 : undef;
+
+    # Only complete codons are mapped.
+    if (defined($cds_offset) && $cds_offset + 3 <= $cds_length) {
+        my $consumed = 0;   # CDS bases contained in the segments already passed
+        foreach my $segment (@segments) {
+            my ($segment_start, $segment_length) = @$segment;
+            if ($cds_offset < $consumed + $segment_length) {
+                # Tracking the running offset (rather than restarting at each
+                # segment) keeps codons split across exons in phase.
+                my $within = $cds_offset - $consumed;
+                $translated_bp = ($strand eq "+")
+                    ? $segment_start + $within
+                    : $segment_start - $within;
+                print "DEBUG: Mapping $aacoord to $translated_bp (CDS offset $cds_offset bp)\n" if($debug);
+                last;
+            }
+            $consumed += $segment_length;
+        }
+    }
+
+    unless (defined $translated_bp) {
+        print STDERR "Unable to translate AA coordinate: $aacoord. Off end. Using undef\n" if($debug);
+    }
+    return $translated_bp;
+}
+
+
+
+## private method, used by populate_gene_obj()
+# sets CDS_coords instance member to a hash reference of CDS coordinates.   $hash{end5} = end3
+sub set_CDS_coords {
+    my ($self, $hash_ref) = @_;
+    # Only a plain hash reference (end5 => end3) is stored; anything else is
+    # reported on STDERR and the object is left untouched.
+    if (ref ($hash_ref) ne 'HASH') {
+        print STDERR "Cannot set CDS_coords, must have hash reference\n";
+    }
+    else {
+        $self->{CDS_coords} = $hash_ref;
+    }
+}
+
+
+
+
+=over 4
+
+=item get_gene_span()
+
+B<Description:> Retrieves the coordinates which span the length of the gene along the genomic sequence.
+
+B<Parameters:> none
+
+B<Returns:> (end5, end3)
+
+These coordinates represent the minimal and maximal exonic coordinates of the gene.  Orientation can be inferred by the relative values of end5 and end3.
+
+
+=back
+
+=cut
+
+    ;
+
+## All return gene end5, end3 ###
+sub get_gene_span {
+    my ($self) = @_;
+    # gene_span is stored as an array ref; hand it back as a flat list.
+    return (@{ $self->{gene_span} });
+}
+
+
+
+
+## private
+sub get_seq_span {
+    my ($self) = @_;
+    # Same value as get_gene_span().
+    return ($self->get_gene_span());
+}
+
+
+
+=over 4
+
+=item get_coords()
+
+B<Description:> See get_gene_span()
+
+B<Parameters:> none
+
+B<Returns:> (end5, end3)
+
+=back
+
+=cut
+
+
+sub get_coords {
+    my ($self) = @_;
+    # Same value as get_gene_span().
+    return ($self->get_gene_span());
+}
+
+
+
+=over 4
+
+=item get_model_span()
+
+B<Description:> Retrieves the coordinates spanned by the protein-coding region of the gene along the genomic sequence.
+
+B<Parameters:> none
+
+B<Returns:> (end5, end3)
+
+These coordinates are determined by the min and max of the CDS components of the gene.
+
+=back
+
+=cut
+
+
+
+
+sub get_model_span {
+    my ($self) = @_;
+    # model_span is stored as an array ref; hand it back as a flat list.
+    return (@{ $self->{model_span} });
+}
+
+
+sub get_CDS_span { # preferred
+    my ($self) = @_;
+    # Same value as get_model_span().
+    return ($self->get_model_span());
+}
+
+
+=over 4
+
+=item get_transcript_span()
+
+B<Description:> Retrieves the coordinates spanned by the exonic regions of the gene along the genomic sequence.
+
+B<Parameters:> none
+
+B<Returns:> (lend, rend)
+
+These coordinates are determined by the min and max of the exon coordinates of the gene.
+
+=back
+
+=cut
+
+
+sub get_transcript_span {
+    my ($self) = @_;
+
+    # Pool both ends of every exon and sort numerically; the first and last
+    # values are then the left-most and right-most exonic coordinates.
+    my @sorted = sort {$a<=>$b} map { $_->get_coords() } $self->get_exons();
+
+    my $lend = shift @sorted;
+    my $rend = pop @sorted;
+
+    return ($lend, $rend);
+}
+
+
+sub is_pseudogene {
+    my ($self) = @_;
+    # Returns the stored is_pseudogene value as-is.
+    return ($self->{is_pseudogene});
+}
+
+sub set_pseudogene {
+    # Set the pseudogene flag on this gene and on all of its additional isoforms.
+    #   $pseudogene_val : must be exactly 0 or 1
+    # Dies (confess) on any other value.  Returns nothing.
+    my ($self, $pseudogene_val) = @_;
+
+    # The pattern is anchored at both ends: an unanchored /[01]/ would accept
+    # any value that merely contains a 0 or a 1, such as "10" or "2015".
+    unless (defined($pseudogene_val) && $pseudogene_val =~ /\A[01]\z/) {
+        confess "Error, can set pseudogene to zero or one only.\n";
+    }
+
+    foreach my $gene ($self, $self->get_additional_isoforms()) {
+        $gene->{is_pseudogene} = $pseudogene_val;
+    }
+
+    return;
+}
+
+
+
+#private
+# sets mRNA_coords instance member to a hash reference of mRNA exon coordinates.   $hash{end5} = end3
+sub set_mRNA_coords {
+    # Store the mRNA exon coordinates (hash ref of end5 => end3).
+    # Anything other than a plain hash reference is reported on STDERR and ignored.
+    my $self = shift;
+    my $hash_ref = shift;
+    if (ref ($hash_ref) eq 'HASH') {
+        $self->{mRNA_coords} = $hash_ref;
+    } else {
+        # The message previously named CDS_coords (copied from
+        # set_CDS_coords), which pointed at the wrong argument.
+        print STDERR "Cannot set mRNA_coords, must have hash reference\n";
+    }
+}
+
+
+=over 4
+
+=item refine_gene_object()
+
+B<Description:> This method performs some data management operations and should be called any time modifications have been made to the gene structure (i.e. exons added or modified, model isoforms added, etc.).  It performs the following operations:
+
+    -Sets (or resets)  gene span and model span coordinates, strand orientation, gene length, mid-point.
+
+B<Parameters:> none
+
+B<Returns:> none
+
+=back
+
+=cut
+
+## Once mRNA_coords and CDS_coords have been assigned, this will populate the remaining elements in the gene object.
+
+sub refine_gene_object {
+    # Rebuild the derived state of the gene.
+    #  - When both CDS_coords and mRNA_coords are set, the exon list is
+    #    rebuilt from them: each mRNA exon receives the first CDS segment
+    #    found that lies completely inside it.
+    #  - Otherwise, if exon objects already exist, only trivial_refinement()
+    #    is run.
+    # Parameters: none.  Returns: nothing.
+    my ($self) = @_;
+
+    # mRNA_coords holds a hash ref only while coordinates are staged
+    # (populate_gene_obj() resets it to 0).  Test with ref() instead of a
+    # numeric comparison against 0, which warns on undef and compares a
+    # reference by its address.
+    unless (ref $self->{mRNA_coords}) {
+        $self->{mRNA_coords} = {};
+    }
+    my ($CDS_coords, $mRNA_coords) = ($self->{CDS_coords},  $self->{mRNA_coords});
+
+    unless ($CDS_coords && $mRNA_coords) {
+        # maybe exon objects were created already
+        if ($self->{mRNA_exon_objs}) {
+            $self->trivial_refinement();
+        }
+        return;
+    }
+
+    # Start over with an empty exon list.
+    $self->{mRNA_exon_objs} = [];
+
+    ## if correlation between mRNA exons and CDS exons, then map CDS's to mRNA's, otherwise, replicate CDSs as mRNAs
+    if (scalar(keys %$mRNA_coords) >= scalar(keys %$CDS_coords)) {
+
+        foreach my $mRNA_end5 (keys %$mRNA_coords) {
+            my $mRNA_end3 = $mRNA_coords->{$mRNA_end5};
+            # compare in left/right orientation regardless of strand
+            my ($m1, $m2) = ($mRNA_end5 < $mRNA_end3) ? ($mRNA_end5, $mRNA_end3) : ($mRNA_end3, $mRNA_end5);
+            my $mRNA_exon_obj = mRNA_exon_obj->new ($mRNA_end5, $mRNA_end3);
+            foreach my $CDS_end5 (keys %$CDS_coords) {
+                my $CDS_end3 = $CDS_coords->{$CDS_end5};
+                my ($c1, $c2) = ($CDS_end5 < $CDS_end3) ? ($CDS_end5, $CDS_end3) : ($CDS_end3, $CDS_end5);
+                ## CDS must be contained within mRNA exon
+                if ( ($c1 >= $m1) && ($c2 <= $m2)) {
+                    $mRNA_exon_obj->{CDS_exon_obj} = CDS_exon_obj->new ($CDS_end5, $CDS_end3);
+                    last;   # only the first contained CDS segment is attached
+                }
+            }
+            $self->add_mRNA_exon_obj($mRNA_exon_obj);
+        }
+    } else { # remap CDSs to mRNAS
+        print STDERR "ERROR: mRNA exons < CDS exons.  Copying all CDS exons into mRNA exons. \n\n";
+        foreach my $CDS_end5 (keys %$CDS_coords) {
+            my $CDS_end3 = $CDS_coords->{$CDS_end5};
+            my $mRNA_exon_obj = mRNA_exon_obj->new ($CDS_end5, $CDS_end3);
+            $mRNA_exon_obj->{CDS_exon_obj} = CDS_exon_obj->new ($CDS_end5, $CDS_end3);
+            $self->add_mRNA_exon_obj($mRNA_exon_obj);
+        }
+    }
+
+    $self->trivial_refinement();
+
+    ## assign orientation to all children exon and CDS components.
+    my $strand = $self->get_orientation();
+    foreach my $exon ($self->get_exons()) {
+        $exon->{strand} = $strand;
+        if (my $cds = $exon->get_CDS_exon_obj()) {
+            $cds->{strand} = $strand;
+        }
+    }
+    return;
+
+}
+
+
+## alias
+sub refine_gene_obj {
+    my ($self) = @_;
+    # Shorter-named alias for refine_gene_object().
+    return $self->refine_gene_object();
+}
+
+    
+=over 4
+
+=item get_exons()
+
+B<Description:>Retrieves a list of exons belonging to this Gene_obj 
+
+B<Parameters:> none
+
+B<Returns:> @exons
+
+@exons is an ordered list of mRNA_exon_obj; the first exon of the list corresponds to the first exon of the spliced gene.
+
+=back
+
+=cut
+
+    ;
+
+sub get_exons {
+    # Return the exon objects in transcript order: ascending end5, reversed
+    # when the gene's strand is '-'.  Returns an empty list when no exon
+    # list has been set.
+    my ($self) = @_;
+
+    my @exons;
+
+    # mRNA_exon_objs holds an array ref only once exons exist.  Test with
+    # ref() instead of a numeric comparison against 0, which warns on undef
+    # and would try to dereference any other non-zero scalar.
+    my $exons_aref = $self->{mRNA_exon_objs};
+    if (ref $exons_aref) {
+        @exons = sort {$a->{end5}<=>$b->{end5}} @{$exons_aref};
+        # the strand may not have been determined yet
+        if (defined($self->{strand}) && $self->{strand} eq '-') {
+            @exons = reverse (@exons);
+        }
+    }
+    return (@exons);
+}
+
+
+## private
+sub get_segments {
+    my ($self) = @_;
+    # Same value as get_exons().
+    return ($self->get_exons());
+}
+
+
+
+=over 4
+
+=item number_of_exons()
+
+B<Description:> Provides the number of exons contained by the Gene
+
+B<Parameters:> none
+
+B<Returns:> int
+
+=back
+
+=cut
+
+
+
+sub number_of_exons {
+    # Number of exon objects currently attached to this gene.
+    my ($self) = @_;
+
+    # get_exons() and add_mRNA_exon_obj() both allow mRNA_exon_objs to be a
+    # non-reference placeholder before any exon is added, so guard the
+    # dereference the same way and report zero exons in that case.
+    my $exons_aref = $self->{mRNA_exon_objs};
+    my $exon_number = (ref $exons_aref) ? scalar(@{$exons_aref}) : 0;
+    return ($exon_number);
+}
+
+
+
+
+
+
+
+=over 4
+
+=item get_intron_coordinates()
+
+B<Description:> Provides an ordered list of intron coordinates
+
+B<Parameters:> none
+
+B<Returns:> ( [end5,end3], ....) 
+
+A list of arrayRefs are returned providing the coordinates of introns, ordered from first intron to last intron within the gene.
+
+=back
+
+=cut
+
+    ;
+
+sub get_intron_coordinates {
+    my ($gene_obj) = @_;
+    my $strand = $gene_obj->get_orientation();
+    my @exons = $gene_obj->get_exons();   ## exon list should already be sorted.
+    my @introns = ();
+
+    # Only genes with multiple exons have introns; for anything shorter the
+    # strand is not even inspected.
+    return (@introns) unless (scalar(@exons) > 1);
+
+    unless ($strand eq '+' || $strand eq '-') {
+        die "Strand for gene_obj is not specified." . $gene_obj->toString();
+    }
+
+    # Walking 5' to 3' means increasing coordinates on '+' and decreasing
+    # coordinates on '-'.
+    my $step = ($strand eq '+') ? 1 : -1;
+
+    foreach my $idx (1 .. $#exons) {
+        my ($prev_end5, $prev_end3) = $exons[$idx - 1]->get_coords();
+        my ($next_end5, $next_end3) = $exons[$idx]->get_coords();
+
+        # The intron occupies the bases strictly between the two exons.
+        my $intron_end5 = $prev_end3 + $step;
+        my $intron_end3 = $next_end5 - $step;
+
+        # Keep the intron only when its end3 lies strictly downstream of its end5.
+        if (($intron_end3 - $intron_end5) * $step > 0) {
+            push (@introns, [$intron_end5, $intron_end3]);
+        }
+    }
+    return (@introns);
+}
+
+
+
+
+
+#private
+sub trivial_refinement {
+    # Recompute num_exons, strand, gene_span, gene_length, mid_pt and
+    # model_span from the exon objects, then widen gene_span/gene_length to
+    # cover any additional isoforms.
+    my ($self) = @_;
+
+    my @exons = $self->get_exons();
+    $self->{num_exons} = scalar(@exons);
+
+    # Collect end5 => end3 for every exon and for every CDS segment.
+    my (%exon_end3_for, %cds_end3_for);
+    for my $exon (@exons) {
+        my ($exon_end5, $exon_end3) = $exon->get_mRNA_exon_end5_end3();
+        $exon_end3_for{$exon_end5} = $exon_end3;
+        if (my $cds = $exon->get_CDS_obj()) {
+            my ($cds_end5, $cds_end3) = $cds->get_CDS_end5_end3();
+            $cds_end3_for{$cds_end5} = $cds_end3;
+        }
+    }
+
+    # The strand is taken from the lowest-end5 exon whose two ends differ;
+    # it stays 0 when no such exon exists.
+    my $strand = 0;
+    for my $end5 (sort {$a<=>$b} keys %exon_end3_for) {
+        my $end3 = $exon_end3_for{$end5};
+        next if ($end5 == $end3);
+        $strand = ($end5 < $end3) ? '+':'-';
+        last;
+    }
+    $self->{strand} = $strand;
+
+    ## determine gene and model boundaries:
+    # Flattening each hash yields all of its end5 and end3 values together.
+    my $is_forward = ($strand eq "+");
+    my @gene_coords = sort {$a<=>$b} %exon_end3_for;
+    my @model_coords = sort {$a<=>$b} %cds_end3_for;
+
+    ## bound gene by transcript span
+    my $gene_lend = shift @gene_coords;
+    my $gene_rend = pop @gene_coords;
+    my ($gene_end5, $gene_end3) = $is_forward ? ($gene_lend, $gene_rend) : ($gene_rend, $gene_lend);
+
+    ## the model defaults to the gene boundaries when there is no CDS
+    my ($model_end5, $model_end3) = ($gene_end5, $gene_end3);
+    if (@model_coords) {
+        ## bound model by protein coding span
+        my $model_lend = shift @model_coords;
+        my $model_rend = pop @model_coords;
+        ($model_end5, $model_end3) = $is_forward ? ($model_lend, $model_rend) : ($model_rend, $model_lend);
+    }
+
+    $self->{gene_span} = [$gene_end5, $gene_end3];
+    $self->{gene_length} = abs ($gene_end3 - $gene_end5) + 1;
+    $self->{mid_pt} = int (($gene_end5 + $gene_end3)/2);
+    $self->{model_span} = [$model_end5, $model_end3];
+
+    ## Refine isoforms if they exist.
+    if (my @isoforms = $self->get_additional_isoforms()) {
+        my @span_coords = $self->get_gene_span();
+        for my $isoform (@isoforms) {
+            $isoform->refine_gene_object();
+            push (@span_coords, $isoform->get_gene_span());
+        }
+        @span_coords = sort {$a<=>$b} @span_coords;
+        my $lend = shift @span_coords;
+        my $rend = pop @span_coords;
+        # on the minus strand end5 is the larger coordinate
+        ($lend, $rend) = ($rend, $lend) if ($self->{strand} eq '-');
+        my $gene_length = abs ($lend -$rend) + 1;
+        # every isoform shares the widened span
+        for my $gene ($self, @isoforms) {
+            $gene->{gene_span} = [$lend, $rend];
+            $gene->{gene_length} = $gene_length;
+        }
+    }
+
+}
+
+
+
+
+=over 4
+
+=item add_mRNA_exon_obj()
+
+B<Description:> Used to add a single mRNA_exon_obj to the Gene_obj 
+
+B<Parameters:> mRNA_exon_obj
+
+B<Returns:> none
+
+=back
+
+=cut
+
+    ;
+
+sub add_mRNA_exon_obj {
+    my ($self, $mRNA_exon_obj) = @_;
+    # Create the exon list on first use (the slot may hold a non-reference value).
+    $self->{mRNA_exon_objs} = [] unless (ref($self->{mRNA_exon_objs}));
+    # Append the exon to the end of the list.
+    push (@{$self->{mRNA_exon_objs}}, $mRNA_exon_obj);
+    return $mRNA_exon_obj;
+}
+
+#private
+## forcibly set protein sequence value
+
+
+sub set_protein_sequence {
+    my ($self, $protein) = @_;
+    # A false value (undef, empty string, "0") is reported and not stored.
+    if (!$protein) {
+        print STDERR "No incoming protein sequence to set to.\n" . $self->toString();
+    }
+    else {
+        # store the sequence and cache its length alongside it
+        $self->{protein_seq} = $protein;
+        $self->{protein_seq_length} = length ($protein);
+    }
+}
+
+#private
+## forcibly set CDS sequence value
+sub set_CDS_sequence {
+    my ($self, $cds_seq) = @_;
+    # A false value (undef, empty string, "0") is reported and not stored.
+    if (!$cds_seq) {
+        print STDERR "No incoming CDS sequence to set to\n" . $self->toString();
+    }
+    else {
+        # store the sequence and cache its length alongside it
+        $self->{CDS_sequence} = $cds_seq;
+        $self->{CDS_sequence_length} = length ($cds_seq);
+    }
+}
+
+#private
+sub set_cDNA_sequence {
+    my ($self, $cDNA_seq) = @_;
+    # A false value (undef, empty string, "0") is reported and not stored.
+    if (!$cDNA_seq) {
+        print STDERR "No incoming cDNA sequence to set to.\n" . $self->toString();
+    }
+    else {
+        # store the sequence and cache its length alongside it
+        $self->{cDNA_sequence} = $cDNA_seq;
+        $self->{cDNA_sequence_length} = length($cDNA_seq);
+    }
+}
+
+#private
+sub set_gene_sequence {
+    my ($self, $seq) = @_;
+    # A false value (undef, empty string, "0") is reported and not stored.
+    if (!$seq) {
+        print STDERR "No incoming gene sequence to set to\n" . $self->toString();
+    }
+    else {
+        # store the sequence and cache its length alongside it
+        $self->{gene_sequence} = $seq;
+        $self->{gene_sequence_length} = length ($seq);
+    }
+}
+
+
+=over 4
+
+=item create_all_sequence_types()
+
+B<Description:> Given a scalar reference to the genomic sequence, the CDS, cDNA, unspliced transcript and protein sequences are constructed and populated within the Gene_obj
+
+B<Parameters:> $genomic_seq_ref, [%params]
+
+B<Returns:> 0|1
+
+returns 1 upon success, 0 upon failure
+
+By default, the protein and CDS sequence are populated.  If you want the unspliced genomic sequence, you need to specify this in the attributes:
+
+    %params = ( protein => 1,
+		CDS => 1,
+		cDNA => 1,
+		unspliced_transcript => 0)
+
+
+=back
+
+=cut
+
+
+## Create all gene sequences (protein, cds, cdna, genomic)
+sub create_all_sequence_types {
+    # Build the sequence fields of this gene, and of every additional
+    # isoform, from the genomic sequence.
+    #   $big_seq_ref : scalar reference to the genomic sequence
+    #   %atts        : optional switches
+    #                    cDNA, CDS, protein   - built unless given a false value
+    #                    unspliced_transcript - built only when given a true value
+    # Returns 1 on success, undef when $big_seq_ref is not a scalar reference.
+    my ($self, $big_seq_ref, %atts) = @_;
+
+    unless (ref($big_seq_ref) eq 'SCALAR') {
+        print STDERR "I require a sequence reference to create sequence types\n";
+        return (undef());
+    }
+
+    # cDNA, CDS and protein default to "on": a type is skipped only when the
+    # caller explicitly passes a false value for it.  The test used to be
+    # inverted, so that e.g. cDNA => 1 suppressed the cDNA sequence.
+    my $is_wanted = sub {
+        my ($type) = @_;
+        return (!exists($atts{$type}) || $atts{$type}) ? 1 : 0;
+    };
+
+    $self->create_cDNA_sequence($big_seq_ref) if ($is_wanted->('cDNA'));
+    $self->create_gene_sequence($big_seq_ref, 1) if ($atts{unspliced_transcript}); #highlight exons by default.
+
+    if ($self->is_coding_gene()) {
+        $self->create_CDS_sequence ($big_seq_ref) if ($is_wanted->('CDS'));
+        $self->create_protein_sequence($big_seq_ref) if ($is_wanted->('protein'));
+    }
+
+    # Isoforms are processed with the same switches.
+    if (my @isoforms = $self->get_additional_isoforms()) {
+        foreach my $isoform (@isoforms) {
+            $isoform->create_all_sequence_types($big_seq_ref, %atts);
+        }
+    }
+    return(1);
+}
+
+#private
+## Create cDNA sequence
+sub create_cDNA_sequence {
+    # Concatenate the exon sequences into the cDNA, store it on the gene and
+    # return it.  $seq_ref may be the parent sequence itself or a reference to it.
+    my ($self, $seq_ref) = @_;
+    if (!$seq_ref) {
+        print STDERR "The parent sequence must be specified for the cDNA creation method\n";
+        return;
+    }
+    ## Work through a reference either way, so that a large parent sequence
+    ## is never copied.
+    my $sequence_ref = ref($seq_ref) ? $seq_ref : \$seq_ref;
+
+    my @exons = $self->get_exons();
+    my $strand = $self->{strand};
+    my $cDNA_seq = "";
+    # Ascending end5 walks the exons left to right along the parent sequence.
+    for my $exon_obj (sort {$a->{end5}<=>$b->{end5}} @exons) {
+        my ($lend, $rend) = ($exon_obj->{end5}, $exon_obj->{end3});
+        ## sequence retrieval coordinates must be in forward orientation
+        ($lend, $rend) = ($rend, $lend) if ($strand ne '+');
+        $cDNA_seq .= substr ($$sequence_ref, ($lend - 1), ($rend - $lend + 1));
+    }
+    $cDNA_seq = &reverse_complement($cDNA_seq) if ($strand eq '-');
+    $self->set_cDNA_sequence($cDNA_seq);
+    return ($cDNA_seq);
+}
+
+#private
+## create a CDS sequence, and populate the protein field.
+sub create_CDS_sequence {
+    # Concatenate the CDS segment sequences, store the result on the gene
+    # and return it.  $seq_ref may be the parent sequence itself or a
+    # reference to it.  Returns "" for a gene without a coding region.
+    my ($self, $seq_ref) = @_;
+    if (!$seq_ref) {
+        print STDERR "The parent sequence must be specified for the CDS creation method\n";
+        return;
+    }
+
+    if (!$self->is_coding_gene()) {
+        print STDERR "Warning: No coding region specified for gene: " . $self->toString();
+        return("");
+    }
+
+    ## Work through a reference either way, so that a large parent sequence
+    ## is never copied.
+    my $sequence_ref = ref($seq_ref) ? $seq_ref : \$seq_ref;
+
+    my @exons = $self->get_exons();
+    my $strand = $self->{strand};
+    my $cds_seq = "";
+    # Ascending end5 walks the exons left to right along the parent sequence.
+    for my $exon_obj (sort {$a->{end5}<=>$b->{end5}} @exons) {
+        my $CDS_obj = $exon_obj->get_CDS_obj();
+        next unless (ref $CDS_obj);   # exon carries no CDS segment
+        my ($lend, $rend) = $CDS_obj->get_CDS_end5_end3();
+        ## sequence retrieval coordinates must be in forward orientation
+        ($lend, $rend) = ($rend, $lend) if ($strand ne '+');
+        $cds_seq .= substr ($$sequence_ref, ($lend - 1), ($rend - $lend + 1));
+    }
+    $cds_seq = &reverse_complement($cds_seq) if ($strand eq '-');
+    $self->set_CDS_sequence($cds_seq);
+
+    return ($cds_seq);
+}
+
+
+
+sub is_coding_gene {
+    my ($self) = @_;
+    # A gene counts as coding when its total CDS length is non-zero.
+    return ($self->get_CDS_length() ? 1 : 0);
+}
+
+
+
+#private
+## Translation requires parent nucleotide sequence (bac, chromosome, whatever).
+sub create_protein_sequence {
+    # Translate the CDS into protein, store it on the gene and return it.
+    # $seq_ref (scalar ref to the parent sequence) is only needed when the
+    # CDS sequence has not been built yet.
+    my ($self, $seq_ref) = @_;
+
+    if (!$self->is_coding_gene()) {
+        print STDERR "Warning: No coding sequence for gene: " . $self->toString();
+        return("");
+    }
+
+    my $cds_sequence = $self->get_CDS_sequence();
+    if (!$cds_sequence) {
+        ## No CDS sequence stored yet: build it first, which needs the
+        ## parent sequence.
+        if (ref($seq_ref) ne 'SCALAR') {
+            print STDERR "I require an assembly sequence ref if the CDS is unavailable\n";
+            return;
+        }
+        $cds_sequence = $self->create_CDS_sequence($seq_ref);
+    }
+
+    my $protein = &Nuc_translator::get_protein ($cds_sequence);
+    $self->set_protein_sequence($protein);
+    return ($protein);
+}
+
+#private
+## Create the unspliced nucleotide transcript
+sub create_gene_sequence {
+    # Build the unspliced gene sequence, store it on the gene and return it.
+    #   $big_seq_ref          : scalar reference to the parent sequence
+    #   $highlight_exons_flag : when true, exons are upper-cased and introns
+    #                           lower-cased; otherwise the gene span is
+    #                           copied unchanged
+    my ($self, $big_seq_ref, $highlight_exons_flag) = @_;
+    if (ref ($big_seq_ref) ne 'SCALAR') {
+        print STDERR "I require a reference to the assembly sequence!!\n";
+        return (undef());
+    }
+    my $strand = $self->{strand};
+    my $gene_seq;
+    if (!$highlight_exons_flag) {
+        # just the sequence spanned by the min and max gene coordinates
+        my ($span_lend, $span_rend) = sort {$a<=>$b} $self->get_gene_span();
+        $gene_seq = substr ($$big_seq_ref, ($span_lend - 1), ($span_rend - $span_lend + 1));
+    }
+    else {
+        # Exons are visited left to right; the first one opens the sequence.
+        my @exons = sort {$a->{end5}<=>$b->{end5}} $self->get_exons();
+        my $first_exon = shift @exons;
+        my ($first_lend, $first_rend) = sort {$a<=>$b} $first_exon->get_coords();
+        $gene_seq = uc (substr ($$big_seq_ref, $first_lend - 1, $first_rend - $first_lend + 1));
+        my $prev_rend = $first_rend;
+        for my $exon (@exons) {
+            my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords();
+            ## intron first (bases between the previous exon and this one), then the exon
+            $gene_seq .= lc (substr ($$big_seq_ref, $prev_rend, $lend - $prev_rend-1));
+            $gene_seq .= uc (substr ($$big_seq_ref, $lend - 1, $rend - $lend + 1));
+            $prev_rend = $rend;
+        }
+    }
+
+    $gene_seq = &reverse_complement($gene_seq) if ($strand eq '-');
+    $self->set_gene_sequence($gene_seq);
+    return ($gene_seq);
+}
+
+## retrieving the sequences
+
+=over 4
+
+=item get_protein_sequence()
+
+B<Description:> Retrieves the protein sequence
+
+B<Parameters:> none
+
+B<Returns:> $protein
+
+Note: You must have called create_all_sequence_types($genomic_ref) before protein sequence is available for retrieval.
+
+
+=back
+
+=cut
+    
+    ;
+
+sub get_protein_sequence {
+    my ($self) = @_;
+    # Returns the stored protein sequence.
+    return ($self->{protein_seq});
+}
+
+## alias
+sub get_protein_seq {
+    my ($self) = @_;
+    # Shorter-named alias for get_protein_sequence().
+    return ($self->get_protein_sequence());
+}
+
+
+
+=over 4
+
+=item get_CDS_sequence()
+
+B<Description:> Retrieves the CDS sequence.  The CDS sequence is the protein-coding nucleotide sequence.
+
+B<Parameters:> none
+
+B<Returns:> $cds
+
+Note: You must have called create_all_sequence_types($genomic_ref) before the CDS sequence is available for retrieval.
+
+=back
+
+=cut
+
+
+sub get_CDS_sequence {
+    my ($self) = @_;
+    # Returns the stored CDS sequence.
+    return ($self->{CDS_sequence});
+}
+
+=over 4
+
+=item get_cDNA_sequence()
+
+B<Description:> Retrieves the tentative cDNA sequence for the Gene.  The cDNA includes the CDS with potential UTR extensions.
+
+B<Parameters:> none
+
+B<Returns:> $cdna
+
+Note: You must have called create_all_sequence_types($genomic_ref) before the cDNA sequence is available for retrieval.
+
+
+=back
+
+=cut
+
+
+
+sub get_cDNA_sequence {
+    my ($self) = @_;
+    # Returns the stored cDNA sequence.
+    return ($self->{cDNA_sequence});
+}
+
+
+
+sub get_CDS_length {
+    my ($self) = @_;
+
+    # Sum the lengths of the CDS segments; exons without a CDS are skipped.
+    my $total = 0;
+    for my $exon ($self->get_exons()) {
+        my $cds = $exon->get_CDS_obj() or next;
+        $total += $cds->length();
+    }
+
+    return ($total);
+}
+
+sub get_cDNA_length {
+    my ($self) = @_;
+
+    # Sum the lengths of all exons.
+    my $total = 0;
+    for my $exon ($self->get_exons()) {
+        $total += $exon->length();
+    }
+
+    return($total);
+}
+
+
+
+
+
+
+
+=over 4
+
+=item get_gene_sequence()
+
+B<Description:> Retrieves the unspliced transcript of the gene.
+
+B<Parameters:> none
+
+B<Returns:> $unspliced_transcript
+
+=back
+
+=cut
+
+
+sub get_gene_sequence {
+    my ($self) = @_;
+    # Returns the stored gene sequence.
+    return ($self->{gene_sequence});
+}
+
+
+
+
+=over 4
+
+=item get_gene_synonyms()
+
+B<Description:> Retrieves the Model_feat_name(s) for the synonymous gene models found on other BACs or contigs.
+
+B<Parameters:> none
+
+B<Returns:> @model_feat_names
+
+
+For Arabidopsis, gene models are found within overlapping regions of BAC sequences, in which the gene models are annotated on both corresponding BACs.  Given a Gene_obj for a model on one BAC, the synonymous gene on the overlapping BAC can be identified via this method.
+
+
+=back
+
+=cut
+
+
+sub get_gene_synonyms {
+    my ($self) = @_;
+    # gene_synonyms is stored as an array ref; hand it back as a flat list.
+    return (@{ $self->{gene_synonyms} });
+}
+
+
+
+=over 4
+
+=item clear_sequence_info()
+
+B<Description:> Clears the sequence fields stored within a Gene_obj, including the CDS, cDNA, gene_sequence, and protein sequence.  Often, these sequence fields, when populated, can consume large amounts of memory in comparison to the coordinate and functional annotation data.  This method is useful to clear this memory when the sequences are not needed.  The create_all_sequence_types($genomic_seq_ref) can be called again later to repopulate these sequences when they are needed.
+
+B<Parameters:> none
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+## sequences consume huge amounts of memory in comparison to other gene features.
+## want to clear them from time to time to save memory.
+    
+    ;
+
+sub clear_sequence_info {
+    my ($self) = @_;
+    # Drop the stored sequences; the keys stay in place with undef values.
+    for my $seq_field (qw(protein_seq CDS_sequence cDNA_sequence)) {
+        $self->{$seq_field} = undef;
+    }
+    # Kept as the final statement so the sub's implicit return value is unchanged.
+    $self->{gene_sequence} = undef;
+}
+
+
+=over 4
+
+=item set_gene_type()
+
+B<Description:> Sets the type of gene.  Expected types include: 
+
+    protein-coding #default setting
+    rRNA
+    snoRNA
+    snRNA
+    tRNA
+    
+    ...or others as needed.  Nothing is restricted.
+
+B<Parameters:> $type
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+####
+sub set_gene_type {
+    my ($self, $gene_type) = @_;
+    # Stored as given; the value is not validated.
+    $self->{gene_type} = $gene_type;
+}
+
+
+=over 4
+
+=item adjust_gene_coordinates()
+
+B<Description:> Used to add or subtract a specified number of bases from each gene component coordinate.
+
+B<Parameters:> $adj_amount
+
+$adj_amount is a positive or negative integer.
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+    ;
+
+####
+# add value to all gene component coordinates
+sub adjust_gene_coordinates {
+    my ($self, $adj_amount) = @_;
+
+    # Add $adj_amount to both ends of one feature (an exon or a CDS segment).
+    my $shift_feature = sub {
+        my ($feature) = @_;
+        my ($end5, $end3) = $feature->get_coords();
+        $feature->set_coords($end5 + $adj_amount, $end3 + $adj_amount);
+    };
+
+    for my $exon ($self->get_exons()) {
+        $shift_feature->($exon);
+        my $cds = $exon->get_CDS_obj();
+        $shift_feature->($cds) if (ref $cds);
+    }
+
+    ## don't forget about alt splicing isoforms!
+    for my $isoform ($self->get_additional_isoforms()) {
+        $isoform->adjust_gene_coordinates($adj_amount);
+    }
+
+    # Recompute the derived gene attributes from the shifted coordinates.
+    $self->refine_gene_object();
+}
+
+
+
+
+=over 4
+
+=item toString()
+
+B<Description:> Textually describes the Gene_obj including coordinates and attributes.
+
+B<Parameters:> <%attributes_list> 
+
+%attributes_list is optional and can control whether certain attributes are included in the textual output
+
+Default settings are:
+
+    %attributes_list = ( 
+			 -showIsoforms => 1,  #set to 0 to avoid isoform info to the text output.
+			 -showSeqs => 0  #set to 1 to include protein, cds, genomic, cdna seqs in the output.
+			 )
+
+B<Returns:> $text
+
+=back
+
+=cut
+
+    ;
+
+
+## retrieve text output describing the gene.
+## Builds and returns a text description of the gene.
+##
+## Optional attributes are passed as a flat key/value list:
+##       -showIsoforms => 1   (default) append each additional isoform; 0 suppresses them
+##       -showSeqs     => 0   (default) skip attributes whose key contains 'seq'; 1 prints them
+##
+## The same attributes are handed on to each isoform's toString() call, so that
+## -showSeqs applies to the isoform text as well.
+sub toString {
+    my $self = shift;
+    my %atts = @_;
+    
+    my $output = "";
+    # keys are sorted so that repeated calls produce identical text.
+    foreach my $key (sort keys %$self) {
+        my $value = $self->{$key};
+        unless (defined $value) { next;}
+        if (ref $value) {
+            # among reference-valued attributes, only array refs whose key contains 'secondary' are listed here.
+            if ($key =~ /secondary/ && ref $value eq "ARRAY") {
+                foreach my $val (@$value) {
+                    $output .= "\t\t$key\t$val\n";
+                }
+            }
+        } else {
+            if ($self->{is_pseudogene} && $key =~ /cds|cdna|protein/i && $key =~ /seq/) {
+                next;
+            }
+            if ((!$atts{-showSeqs}) && $key =~/seq/) { next; }
+            if ($value eq '0' && $key !~/^is_/) { next;} #dont print unpopulated info.
+            $output .= "\t$key:\t$value\n";
+        }
+    }
+    
+    # tolerate a missing gene_synonyms list rather than dereferencing undef.
+    my @gene_synonyms = (ref ($self->{gene_synonyms}) eq "ARRAY") ? @{$self->{gene_synonyms}} : ();
+    $output .= "\tgene_synonyms: @gene_synonyms\n";
+    
+    $output .=  "\tmRNA_coords\t";  
+    
+    if (ref ($self->{mRNA_coords}) eq "HASH") {
+        foreach my $end5 (sort {$a<=>$b} keys %{$self->{mRNA_coords}}) {
+            $output .=  "$end5-$self->{mRNA_coords}->{$end5} ";
+        }
+    }
+    $output .= "\n"
+        . "\tCDS_coords\t";
+    if (ref ($self->{CDS_coords}) eq "HASH") {
+        foreach my $end5 (sort {$a<=>$b} keys %{$self->{CDS_coords}}) {
+            $output .= "$end5-$self->{CDS_coords}->{$end5} ";
+        }
+    }
+    
+    my @exons = $self->get_exons();
+    foreach my $exon (@exons) {
+        $output .=  "\n\t\tRNA-exon: $exon->{end5}, $exon->{end3}\t";
+        my $cds = $exon->{CDS_exon_obj};
+        if ($cds) {
+            $output .= "CDS-exon: $cds->{end5}, $cds->{end3}";
+        }
+    }
+    
+    if (ref $self->{gene_span}) {
+        my ($gene_end5, $gene_end3) = @{$self->{gene_span}};
+        $output .= "\n\tgene_span: $gene_end5-$gene_end3";
+    }
+    if (ref $self->{model_span}) {
+        my ($model_end5, $model_end3) = @{$self->{model_span}};
+        $output .= "\n\tmodel_span: $model_end5-$model_end3";
+    }
+    my @gene_ontology_objs = $self->get_gene_ontology_objs();
+    if (@gene_ontology_objs) {
+        $output .= "\n\tGene Ontology Assignments:\n";
+        foreach my $go_assignment (@gene_ontology_objs) {
+            $output .= "\t" . $go_assignment->toString();
+        }
+    }
+    
+    unless (defined ($atts{-showIsoforms}) && $atts{-showIsoforms} == 0) {
+        foreach my $isoform ($self->get_additional_isoforms()) {
+            $output .= "\n\n\tISOFORM:\n" . $isoform->toString(%atts);
+        }
+    }
+    $output .= "\n\n"; #spacer at terminus
+    return ($output);
+}
+
+
+####
+## Splice site validation section
+####
+
+=over 4
+
+=item validate_splice_sites()
+
+B<Description:> Validates the presence of consensus splice sites 
+
+B<Parameters:> $genomic_seq_ref
+
+$genomic_seq_ref is a scalar reference to the string containing the genomic sequence.
+
+B<Returns:> $errors
+
+If the empty string ("") is returned, then no inconsistencies were identified.
+
+=back
+
+=cut
+
+    ;
+    
+####
+## Checks the two bases flanking each exon against consensus splice sites:
+## 'ag' for acceptors, 'gt' or 'gc' for donors (case-insensitive).
+##
+## Parameter: reference to the genomic sequence string.
+## Returns:   undef (after a message on STDERR) when no reference is given;
+##            "" when the gene has a single exon or all sites are consensus;
+##            otherwise one line of text per non-consensus site.
+##
+## Each message reports the exon boundary adjacent to the offending site.
+## On the '+' strand the acceptor flanks the lower coordinate and the donor the
+## higher one; on the '-' strand it is the other way around.
+sub validate_splice_sites {
+    my $self = shift;
+    my $asmbl_seq_ref = shift;
+    unless (ref ($asmbl_seq_ref)) {
+        print STDERR "I require a sequence reference\n";
+        return (undef());
+    }
+    my $error_string = "";
+    my $strand = $self->{strand};
+    my @exons = $self->get_exons();
+    my $num_exons = scalar(@exons);
+    if ($num_exons == 1) {
+        #no splice sites to confirm.
+        return ("");
+    }
+    for (my $i = 1; $i <= $num_exons; $i++) {
+        my $exon_type;
+        if ($i == 1) { 
+            $exon_type = "initial";
+        } elsif ($i == $num_exons) {
+            $exon_type = "terminal";
+        } else {
+            $exon_type = "internal";
+        }
+        my $exon = $exons[$i - 1]; 
+        my ($exon_end5, $exon_end3) = $exon->get_mRNA_exon_end5_end3();
+        my ($coord1, $coord2) = sort {$a<=>$b} ($exon_end5, $exon_end3);
+        ## zero-based offsets of the two bases left of coord1 and the two bases right of coord2.
+        my $splice_1_start = $coord1-2-1;
+        my $splice_2_start = $coord2-1+1;
+        my $splice_1 = substr ($$asmbl_seq_ref, $splice_1_start, 2);
+        my $splice_2 = substr ($$asmbl_seq_ref, $splice_2_start, 2);
+        
+        ## assign flanks (and the exon boundary each one touches) according to strand.
+        my ($acceptor, $donor, $acceptor_coord, $donor_coord) = ($strand eq '+') 
+            ? ($splice_1, $splice_2, $coord1, $coord2) 
+            : (&reverse_complement($splice_2), &reverse_complement($splice_1), $coord2, $coord1); 
+        my $check_acceptor = ($acceptor =~ /ag/i);
+        my $check_donor = ($donor =~ /gt|gc/i);
+        
+        ## initial exons have only a donor site, terminal exons only an acceptor site.
+        if ($exon_type eq "initial" || $exon_type eq "internal") {
+            unless ($check_donor) {
+                $error_string .= "non-consensus $donor donor splice site at $donor_coord\n";
+            }
+        }
+        
+        if ($exon_type eq "internal" || $exon_type eq "terminal") {
+            unless ($check_acceptor) {
+                $error_string .=  "\tnon-consensus $acceptor acceptor splice site at $acceptor_coord\n";
+            }
+        }
+    }
+    return ($error_string);
+}
+
+
+
+=over 4
+
+=item get_annot_text()
+
+B<Description:> Provides basic functional annotation for a Gene_obj 
+
+B<Parameters:> none
+
+B<Returns:> $string
+
+$string includes locus, pub_locus, com_name, and pub_comment
+
+=back
+
+=cut
+
+
+    ;
+
+####
+## Joins locus, pub_locus, com_name and pub_comment (in that order) into one
+## string; every value that evaluates true is appended followed by a space.
+sub get_annot_text {
+    my ($self) = @_;
+    my $text = "";
+    for my $field (qw(locus pub_locus com_name pub_comment)) {
+        my $token = $self->{$field};
+        $text .= "$token " if ($token);
+    }
+    return ($text);
+}
+
+
+
+=over 4
+
+=item add_isoform()
+
+B<Description:> Adds a Gene_obj to an existing Gene_obj as an alternative splicing variant.
+
+B<Parameters:> Gene_obj
+
+B<Returns:> none
+
+=back
+
+=cut
+
+    ;
+## Appends each supplied Gene_obj to the additional_isoforms list and bumps
+## the num_additional_isoforms counter once per object.
+sub add_isoform {
+    my ($self, @gene_objs) = @_;
+    for my $isoform (@gene_objs) {
+        push (@{$self->{additional_isoforms}}, $isoform);
+        $self->{num_additional_isoforms}++;
+    }
+}
+
+
+
+
+
+=over 4
+
+=item has_additional_isoforms()
+
+B<Description:> Provides number of additional isoforms.  Typically used as a boolean.
+
+B<Parameters:> none
+
+B<Returns:> number of additional isoforms (int)
+
+If no additional isoforms exist, returns 0
+
+
+boolean usage:
+
+0 = false (has no more)
+nonzero = true (has additional isoforms)
+
+=back
+
+=cut
+
+## Returns the stored num_additional_isoforms value.
+sub has_additional_isoforms {
+    my ($self) = @_;
+    my $isoform_count = $self->{num_additional_isoforms};
+    return ($isoform_count);
+}
+
+
+
+=over 4
+
+=item delete_isoforms()
+
+B<Description:> removes isoforms stored in this Gene_obj (assigning to a new anonymous arrayref)
+
+B<Parameters:> none
+
+B<Returns:> none
+
+=back
+
+=cut
+
+## Zeroes the isoform counter and replaces the isoform list with a new empty array ref.
+sub delete_isoforms {
+    my ($self) = @_;
+    $self->{num_additional_isoforms} = 0;
+    $self->{additional_isoforms}     = [];
+}
+
+
+
+
+
+=over 4
+
+=item get_additional_isoforms()
+
+B<Description:> Retrieves the additional isoforms for a given Gene_obj
+
+B<Parameters:> none
+
+B<Returns:> @Gene_objs
+
+If no additional isoforms exist, an empty array is returned.
+
+=back
+
+=cut
+
+
+## Returns the contents of the additional_isoforms array as a list.
+sub get_additional_isoforms {
+    my ($self) = @_;
+    my $isoforms_aref = $self->{additional_isoforms};
+    return (@$isoforms_aref);
+}
+
+
+
+=over 4
+
+=item get_orientation()
+
+B<Description:> Retrieves the strand orientation of the Gene_obj
+
+B<Parameters:> none
+
+B<Returns:> +|-
+
+=back
+
+=cut
+
+
+## Returns the value of the strand attribute.
+sub get_orientation {
+    my ($self) = @_;
+    my $strand = $self->{strand};
+    return ($strand);
+}
+
+
+
+## Preferred alias: delegates to get_orientation().
+sub get_strand {
+    my ($self) = @_;
+    return ($self->get_orientation());
+}
+
+
+
+=over 4
+
+=item add_gene_ontology_objs()
+
+B<Description:> Adds a list of Gene_ontology objects to a Gene_obj
+
+B<Parameters:> @Gene_ontology_objs
+
+@Gene_ontology_objs is a list of objects instantiated from Gene_ontology.pm
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+## Appends the supplied objects to the GeneOntology list (created on first use).
+sub add_gene_ontology_objs {
+    my $self = shift;
+    my @ontology_objs = @_;
+    push (@{$self->{GeneOntology}}, @ontology_objs);
+}
+
+
+
+=over 4
+
+=item get_gene_ontology_objs()
+
+B<Description:> Retrieves Gene_ontology objs assigned to the Gene_obj
+
+B<Parameters:> none
+
+B<Returns:> @Gene_ontology_objs
+
+@Gene_ontology_objs are objects instantiated from package Gene_ontology  (See Gene_ontology.pm)
+
+=back
+
+=cut
+
+    ;
+
+## Returns the stored GeneOntology objects, or an empty list when the
+## GeneOntology attribute does not hold a reference.
+sub get_gene_ontology_objs {
+    my ($self) = @_;
+    my $go_objs = $self->{GeneOntology};
+    return (ref ($go_objs)) ? (@{$go_objs}) : (());
+}
+
+
+=over 4
+
+=item set_5prime_partial()
+
+B<Description:> Sets the status of the is_5prime_partial attribute
+
+B<Parameters:> 1|0
+
+B<Returns:> none
+
+
+5prime partials are partial on their 5prime end and lack start codons.
+
+
+=back
+
+=cut
+
+## Stores $value in the is_5prime_partial attribute.
+## Declared without the former empty prototype: the sub takes arguments, and an
+## empty prototype made any plain function-style call with arguments a compile error.
+sub set_5prime_partial {
+    my $self = shift;
+    my $value = shift;
+    $self->{is_5prime_partial} = $value;
+}
+
+
+
+=over 4
+
+=item set_3prime_partial()
+
+B<Description:> Sets the is_3prime_partial status
+
+B<Parameters:> 1|0
+
+B<Returns:> none
+
+3prime partials are partial on their 3prime end and lack stop codons.
+
+=back
+
+=cut
+
+
+## Stores $value in the is_3prime_partial attribute.
+## Declared without the former empty prototype: the sub takes arguments, and an
+## empty prototype made any plain function-style call with arguments a compile error.
+sub set_3prime_partial {
+    my $self = shift;
+    my $value = shift;
+    $self->{is_3prime_partial} = $value;
+}
+
+
+
+=over 4
+
+=item is_5prime_partial()
+
+B<Description:> Retrieves the 5-prime partial status of the gene.
+
+B<Parameters:> none
+
+B<Returns:> 1|0
+
+=back
+
+=cut
+
+
+## Returns the value of the is_5prime_partial attribute.
+## Declared without the former empty prototype, which rejected the object
+## argument in any plain function-style call.
+sub is_5prime_partial {
+    my $self = shift;
+    return ($self->{is_5prime_partial});
+}
+
+
+=over 4
+
+=item is_3prime_partial()
+
+B<Description:> Retrieves the 3-prime partial status of the gene.
+
+B<Parameters:> none
+
+B<Returns:> 1|0
+
+=back
+
+=cut
+
+
+## Returns the value of the is_3prime_partial attribute.
+## Declared without the former empty prototype, which rejected the object
+## argument in any plain function-style call.
+sub is_3prime_partial {
+    my $self = shift;
+    return ($self->{is_3prime_partial});
+}
+
+=over 4
+
+=item get_5prime_UTR_coords
+	
+
+B<Description:> returns a list of coordinate pairs corresponding to the 5\' UTR coordinates
+
+B<Parameters:> none
+
+B<Returns:> ([end5,end3], ...) or empty list if none exist
+
+=back
+
+=cut
+
+
+    ;
+
+## Walks the exons in the order get_exons() returns them (relied upon to be
+## sorted) and collects [end5, end3] pairs up to and including the first exon
+## that carries a CDS object.
+sub get_5prime_UTR_coords {
+    my ($self) = @_;
+    
+    my $strand = $self->get_orientation();
+    my @utr_coords;
+    
+  EXON:
+    for my $exon ($self->get_exons()) {
+        my ($exon_end5, $exon_end3) = $exon->get_coords();
+        my $cds = $exon->get_CDS_obj();
+        
+        unless ($cds) {
+            # no CDS on this exon: the whole exon is UTR.
+            push (@utr_coords, [$exon_end5, $exon_end3]);
+            next EXON;
+        }
+        
+        # first CDS-bearing exon: keep only the part preceding the CDS, then stop.
+        my ($cds_end5) = $cds->get_coords();
+        if ($exon_end5 != $cds_end5) {
+            my $utr_end3 = ($strand eq '+') ? ($cds_end5 - 1) : ($cds_end5 + 1);
+            push (@utr_coords, [$exon_end5, $utr_end3]);
+        }
+        last EXON;
+    }
+    
+    return (@utr_coords);
+}
+
+
+
+=over 4
+
+=item get_3prime_UTR_coords
+	
+
+B<Description:> returns a list of coordinate pairs corresponding to the 3\' UTR coordinates
+
+B<Parameters:> none
+
+B<Returns:> ([end5,end3], ...) or empty list if none exist
+
+=back
+
+=cut
+
+    ;
+
+## Walks the exons in reverse of the order get_exons() returns them and collects
+## [end5, end3] pairs up to and including the first exon that carries a CDS
+## object; the collected pairs are put back into forward order before returning.
+sub get_3prime_UTR_coords {
+    my ($self) = @_;
+    
+    my $strand = $self->get_orientation();
+    my @utr_coords;
+    
+  EXON:
+    for my $exon (reverse $self->get_exons()) {
+        my ($exon_end5, $exon_end3) = $exon->get_coords();
+        my $cds = $exon->get_CDS_obj();
+        
+        unless ($cds) {
+            # no CDS on this exon: the whole exon is UTR.
+            push (@utr_coords, [$exon_end5, $exon_end3]);
+            next EXON;
+        }
+        
+        # last CDS-bearing exon: keep only the part following the CDS, then stop.
+        my (undef, $cds_end3) = $cds->get_coords();
+        if ($exon_end3 != $cds_end3) {
+            my $utr_end5 = ($strand eq '+') ? ($cds_end3 + 1) : ($cds_end3 - 1);
+            push (@utr_coords, [$utr_end5, $exon_end3]);
+        }
+        last EXON;
+    }
+    
+    # reversing in place keeps the scalar-context result (the element count) intact.
+    @utr_coords = reverse @utr_coords;
+    
+    return (@utr_coords);
+}
+
+
+
+
+
+=over 4
+
+=item has_UTRs()
+
+B<Description:> indicates presence of UTR annotated in Gene 
+
+B<Parameters:> none
+
+B<Returns:> ( has_5prime_UTR() || has_3prime_UTR() )
+
+=back
+
+=cut
+
+## True when either has_5prime_UTR() or has_3prime_UTR() is true; the 3' check
+## is only made when the 5' check is false.
+sub has_UTRs {
+    my ($self) = @_;
+    if (my $has_5prime = $self->has_5prime_UTR()) {
+        return ($has_5prime);
+    }
+    return ($self->has_3prime_UTR());
+}
+
+
+
+####
+## Calls get_5prime_UTR_coords() in scalar context and returns the result.
+sub has_5prime_UTR {
+    my ($self) = @_;
+    my $num_utr_segments = $self->get_5prime_UTR_coords();
+    return ($num_utr_segments);
+}
+
+####
+## Calls get_3prime_UTR_coords() in scalar context and returns the result.
+sub has_3prime_UTR {
+    my ($self) = @_;
+    my $num_utr_segments = $self->get_3prime_UTR_coords();
+    return ($num_utr_segments);
+}
+
+
+=over 4
+
+=item get_5prime_UTR_sequence()
+
+B<Description:> retrieves 5prime UTR sequence
+
+B<Parameters:> genome sequence reference
+
+B<Returns:> string
+
+=back
+
+=cut
+
+####
+## Extracts the 5' UTR sequence from the genome and checks it against the cDNA.
+##
+## Parameter: reference to the genome sequence string (must be a SCALAR ref,
+##            otherwise confess is called).
+## Returns:   the UTR sequence, or "" when has_5prime_UTR() is false.
+##
+## Note that create_all_sequence_types() is called on the object as part of
+## the verification step.
+sub get_5prime_UTR_sequence {
+    my $self = shift;
+    my ($genome_seq_ref) = @_;
+    unless (ref $genome_seq_ref eq "SCALAR") {
+        confess "error, require genome sequence string reference";
+    }
+
+    unless ($self->has_5prime_UTR()) {
+        return "";
+    }
+    
+    my $orientation = $self->get_orientation();
+    my @coords = $self->get_5prime_UTR_coords();
+    
+    ## concatenate the segments in ascending genomic order.
+    @coords = sort {$a->[0]<=>$b->[0]} @coords;
+
+    my $UTR_seq = "";
+    foreach my $coordset (@coords) {
+        my ($lend, $rend) = sort {$a<=>$b} @$coordset;
+        
+        my $length = $rend - $lend + 1;
+        $UTR_seq .= substr($$genome_seq_ref, $lend - 1, $length);
+    }
+
+    if ($orientation eq '-') {
+        $UTR_seq = &reverse_complement($UTR_seq);
+    }
+
+    ## verify: the UTR must sit at the very start of the cDNA.
+    ## Compared case-insensitively, as get_3prime_UTR_sequence() does, so that
+    ## a difference in letter case alone does not trigger a failure.
+    $self->create_all_sequence_types($genome_seq_ref);
+    my $cDNA = $self->get_cDNA_sequence();
+
+    unless (index(lc($cDNA), lc($UTR_seq)) == 0) {
+        confess "Error, couldn't find UTR in cDNA";
+    }
+
+    return ($UTR_seq);
+}
+        
+
+=over 4
+
+=item get_3prime_UTR_sequence()
+
+B<Description:> retrieves 3prime UTR sequence
+
+B<Parameters:> genome sequence reference
+
+B<Returns:> string
+
+=back
+
+=cut
+
+####
+## Extracts the 3' UTR sequence from the genome and checks that it equals the
+## tail of the cDNA (case-insensitive); confess is called on a mismatch or when
+## the argument is not a SCALAR reference.  Returns "" when has_3prime_UTR()
+## is false.
+sub get_3prime_UTR_sequence {
+    my ($self, $genome_seq_ref) = @_;
+    
+    confess "error, require genome sequence string reference"
+        unless (ref $genome_seq_ref eq "SCALAR");
+
+    return "" unless ($self->has_3prime_UTR());
+    
+    my $orientation = $self->get_orientation();
+    
+    ## stitch the segments together in ascending genomic order.
+    my $UTR_seq = "";
+    foreach my $segment (sort {$a->[0]<=>$b->[0]} $self->get_3prime_UTR_coords()) {
+        my ($lend, $rend) = sort {$a<=>$b} @$segment;
+        $UTR_seq .= substr($$genome_seq_ref, $lend - 1, $rend - $lend + 1);
+    }
+
+    $UTR_seq = &reverse_complement($UTR_seq) if ($orientation eq '-');
+
+    ## verify against the last length($UTR_seq) bases of the cDNA.
+    $self->create_all_sequence_types($genome_seq_ref);
+    my $cDNA = $self->get_cDNA_sequence();
+    my $utr_length = length($UTR_seq);
+    my $cDNA_utr = lc substr($cDNA, length($cDNA) - $utr_length, $utr_length);
+
+    if ($cDNA_utr ne lc $UTR_seq) {
+        confess "Error, 3' UTR extracted from cDNA is different from UTR sequence extracted from genome.\n"
+            . "cDNA_utr:\n$cDNA_utr\nUTR_from_genome:\n$UTR_seq\n\n";
+    }
+    
+    return ($UTR_seq);
+}
+        
+
+
+
+=over 4
+
+=item trim_UTRs()
+
+B<Description:> Trims the UTR of the Gene_obj so that the Exon coordinates are identical to the CDS coordinates.  Exons which lack CDS components and are completely UTR are removed. 
+
+B<Parameters:> none
+
+B<Returns:> none
+
+=back
+
+=cut
+
+    ;
+
+## Keeps only exons that carry a CDS object, sets each kept exon's coordinates
+## to those of its CDS, installs the result as the gene structure, calls
+## refine_gene_object(), and returns the object itself.
+sub trim_UTRs {
+    my ($self) = @_;
+    
+    my @coding_exons;
+    
+    for my $exon ($self->get_exons()) {
+        my $cds = $exon->get_CDS_obj();
+        next unless ($cds); # exon without CDS is dropped
+        
+        my ($exon_end5, $exon_end3) = $exon->get_coords();
+        my ($cds_end5, $cds_end3) = $cds->get_coords();
+        
+        unless ($exon_end5 == $cds_end5 && $exon_end3 == $cds_end3) {
+            $exon->set_coords($cds_end5, $cds_end3);
+        }
+        push (@coding_exons, $exon);
+    }
+    
+    $self->{mRNA_exon_objs} = 0; #clear current gene structure
+    $self->{mRNA_exon_objs} = \@coding_exons; #replace gene structure
+    $self->refine_gene_object(); #update
+    
+    return ($self);
+}
+
+
+
+
+=over 4
+    
+=item remove_CDS_exon()
+
+B<Description:> Removes any existing CDS_exon_obj from this mRNA_exon_obj
+
+B<Parameters:> none
+
+B<Returns:> none
+
+=back
+
+=cut
+
+## Sets the CDS_exon_obj attribute to 0.
+sub remove_CDS_exon {
+    my ($self) = @_;
+    $self->{CDS_exon_obj} = 0;
+}
+
+
+
+
+
+=over 4
+
+=item get_gene_names()
+
+B<Description:> Retrieves gene names (primary gene name followed by secondary gene names), "$;" delimited.
+
+B<Parameters:> none
+
+B<Returns:> string
+				       
+     see $gene_obj->{gene_name}
+     see $gene_obj->get_secondary_names()
+
+secondary gene names sorted lexicographically
+
+
+=back
+
+=cut
+
+
+
+
+####
+## Joins the primary gene_name (when true) and the names returned by
+## get_secondary_gene_names() with "$;".
+sub get_gene_names {
+    my ($gene_obj) = @_;
+    my @gene_names;
+    my $primary_name = $gene_obj->{gene_name};
+    push (@gene_names, $primary_name) if ($primary_name);
+    push (@gene_names, $gene_obj->get_secondary_gene_names());
+    return (join ("$;" , @gene_names));
+}
+
+
+
+=over 4
+
+=item get_secondary_gene_names()
+
+B<Description:> Retrieves secondary gene names as a lexicographically sorted list.
+
+B<Parameters:> none
+
+B<Returns:> list of strings
+
+=back
+
+=cut
+
+
+####
+## Returns the secondary_gene_names entries, sorted with the default string sort.
+sub get_secondary_gene_names {
+    my $gene_obj = shift;
+    return (sort @{ $gene_obj->{secondary_gene_names} });
+}
+
+
+
+
+=over 4
+
+=item get_product_names()
+
+B<Description:> Retrieves product name, with the primary product name followed by secondary product names, delimited by "$;"
+
+B<Parameters:> none
+
+B<Returns:> string
+
+    see $gene_obj->{com_name} for primary product name
+    see $gene_obj->get_secondary_product_names()
+
+=back
+
+=cut
+
+    ;
+
+####
+## Joins the primary com_name (when true) and the names returned by
+## get_secondary_product_names() with "$;".
+sub get_product_names {
+    my ($gene_obj) = @_;
+    my @product_names;
+    my $primary_name = $gene_obj->{com_name};
+    push (@product_names, $primary_name) if ($primary_name);
+    push (@product_names, $gene_obj->get_secondary_product_names());
+    return (join ("$;", @product_names));
+}
+
+
+
+=over 4
+
+=item get_secondary_product_names()
+
+B<Description:> Retrieves secondary product names as a lexicographically sorted list.
+
+B<Parameters:> none 
+
+B<Returns:> list of strings
+
+=back
+
+=cut
+
+
+####
+## Returns the secondary_product_names entries, sorted with the default string sort.
+sub get_secondary_product_names {
+    my $gene_obj = shift;
+    return (sort @{ $gene_obj->{secondary_product_names} });
+}
+
+
+
+=over 4
+
+=item get_gene_symbols()
+
+B<Description:> Retrieves primary gene symbol followed by secondary gene symbols, delimited by "$;"
+
+B<Parameters:> none
+
+B<Returns:> string
+
+    see $gene_obj->{gene_sym}
+    see $gene_obj->get_secondary_gene_symbols()
+
+=back
+
+=cut
+
+    ;
+
+####
+## Joins the primary gene_sym (when true) and the symbols returned by
+## get_secondary_gene_symbols() with "$;".
+sub get_gene_symbols {
+    my ($gene_obj) = @_;
+    my @gene_symbols;
+    my $primary_symbol = $gene_obj->{gene_sym};
+    push (@gene_symbols, $primary_symbol) if ($primary_symbol);
+    push (@gene_symbols, $gene_obj->get_secondary_gene_symbols());
+    return (join ("$;", @gene_symbols));
+}
+
+
+=over 4
+
+=item get_secondary_gene_symbols()
+
+B<Description:> Retrieves secondary gene symbols as a lexicographically sorted list.
+
+B<Parameters:> none
+
+B<Returns:> list of strings
+
+=back
+
+=cut
+
+
+####
+## Returns the secondary_gene_symbols entries, sorted with the default string sort.
+sub get_secondary_gene_symbols {
+    my $gene_obj = shift;
+    return (sort @{ $gene_obj->{secondary_gene_symbols} });
+}
+
+
+
+=over 4
+
+=item get_ec_numbers()
+
+B<Description:> Retrieves primary EC number followed by secondary EC numbers, "$;" delimited
+
+B<Parameters:> none
+
+B<Returns:> string
+
+    see $gene_obj->{ec_num}
+    see $gene_obj->get_secondary_ec_numbers()
+    
+=back
+
+=cut
+
+    ;
+
+####
+## Joins the primary ec_num (when true) and the numbers returned by
+## get_secondary_ec_numbers() with "$;".
+sub get_ec_numbers {
+    my ($gene_obj) = @_;
+    my @ec_numbers;
+    my $primary_ec = $gene_obj->{ec_num};
+    push (@ec_numbers, $primary_ec) if ($primary_ec);
+    push (@ec_numbers, $gene_obj->get_secondary_ec_numbers());
+    return (join ("$;", @ec_numbers));
+}
+
+
+
+=over 4
+
+=item get_secondary_ec_numbers()
+
+B<Description:> Retrieves secondary EC numbers as a lexicographically sorted list.
+
+B<Parameters:> none
+
+B<Returns:> list of strings
+
+
+=back
+
+=cut
+
+
+####
+## Returns the secondary_ec_numbers entries, sorted with the default string sort.
+sub get_secondary_ec_numbers {
+    my $gene_obj = shift;
+    return (sort @{ $gene_obj->{secondary_ec_numbers} });
+}
+
+
+
+=over 4
+
+=item add_secondary_gene_names()
+
+B<Description:> Adds secondary gene name(s) 
+
+B<Parameters:> (gene_name_1, gene_name_2, ....)
+
+Single gene name or list of gene names is allowed
+
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+
+####
+## Appends the supplied names to the secondary_gene_names list.
+## The names are first passed through trim_leading_trailing_ws(), as the other
+## add_secondary_* subs do, so all four lists are stored the same way.
+sub add_secondary_gene_names {
+    my ($gene_obj, @gene_names) = @_;
+    &trim_leading_trailing_ws(\@gene_names);
+    push (@{$gene_obj->{secondary_gene_names}}, @gene_names);
+}
+
+
+=over 4
+
+=item add_secondary_product_names()
+
+B<Description:> Adds secondary product names
+
+B<Parameters:> (product_name_1, product_name_2, ...)
+
+Single or list of product names as parameter
+
+B<Returns:> none
+
+Primary product name added directly as an attribute like so
+    $gene_obj->{com_name} = name
+
+=back
+
+=cut
+
+
+####
+## Passes the supplied names through trim_leading_trailing_ws() and appends
+## them to the secondary_product_names list.
+sub add_secondary_product_names {
+    my $gene_obj = shift;
+    my @product_names = @_;
+    &trim_leading_trailing_ws(\@product_names);
+    push (@{$gene_obj->{secondary_product_names}}, @product_names);
+}
+
+
+=over 4
+
+=item add_secondary_gene_symbols()
+
+B<Description:> Add secondary gene symbols
+
+B<Parameters:> (gene_symbol_1, gene_symbol_2, ...)
+
+String or list context
+
+B<Returns:> none
+
+Primary gene_symbol added directly as attribute like so:
+    $gene_obj->{gene_sym} = symbol
+
+=back
+
+=cut
+
+
+####
+## Passes the supplied symbols through trim_leading_trailing_ws() and appends
+## them to the secondary_gene_symbols list.
+sub add_secondary_gene_symbols {
+    my $gene_obj = shift;
+    my @gene_symbols = @_;
+    &trim_leading_trailing_ws(\@gene_symbols);
+    push (@{$gene_obj->{secondary_gene_symbols}}, @gene_symbols);
+}
+
+
+
+
+
+=over 4
+
+=item add_secondary_ec_numbers()
+
+B<Description:> Add secondary Enzyme Commission (EC) numbers
+
+B<Parameters:> (EC_1, EC_2, ...)
+
+String or list context
+
+B<Returns:> none
+
+
+Primary EC number added directly as an attribute like so:
+    $gene_obj->{ec_num} = EC_number
+
+=back
+
+=cut
+
+
+####
+## Passes the supplied EC numbers through trim_leading_trailing_ws() and
+## appends them to the secondary_ec_numbers list.
+sub add_secondary_ec_numbers {
+    my $gene_obj = shift;
+    my @ec_numbers = @_;
+    &trim_leading_trailing_ws(\@ec_numbers);
+    push (@{$gene_obj->{secondary_ec_numbers}}, @ec_numbers);
+}
+
+####
+## Emits one GFF3 'match' line per exon of $gene_obj (isoforms are not visited;
+## send them in as separate objects).
+##
+## Parameters: $gene_obj, $id (ID= value), $target (Target= name),
+##             $source (optional, defaults to ".").
+## Returns:    the concatenated lines; croaks when $gene_obj is not a reference
+##             or $id / $target is undefined.
+##
+## Exons are ordered by ascending end5 (reversed for '-' orientation), and the
+## Target coordinates run consecutively from 1 in that order.
+sub to_alignment_GFF3_format {
+    my ($gene_obj, $id, $target, $source) = @_;
+
+    $source = "." unless (defined $source);
+
+    unless ( (ref $gene_obj)  && defined($id) && defined($target)) {
+        croak "Error, need gene_obj, id, and target names as params";
+    }
+
+    my $orient = $gene_obj->get_orientation();
+    my $scaff = $gene_obj->{asmbl_id};
+    
+    my @exons = sort {$a->{end5}<=>$b->{end5}} $gene_obj->get_exons();
+    @exons = reverse @exons if ($orient eq '-');
+
+    my @gff3_lines;
+    my $prev_match_rend = 0;
+
+    foreach my $exon (@exons) {
+        my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords();
+        
+        my $m_lend = $prev_match_rend + 1;
+        my $m_rend = $prev_match_rend + ($rend - $lend + 1);
+        
+        # giving everything 100% identity since genome-based 
+        push (@gff3_lines, join("\t", $scaff, $source, "match", $lend, $rend, "100", $orient, '.',
+                                "ID=$id;Target=$target $m_lend $m_rend +") . "\n");
+        
+        $prev_match_rend = $m_rend;
+    }
+    
+    return(join("", @gff3_lines));
+}
+
+
+
+
+####
+## Produces transcript-level GTF text for the gene and each additional isoform.
+##
+## no worries about protein-coding regions.  Only report transcripts and exons tied to a particular gene.
+## used with cufflinks package for computing FPKM values
+##
+## For a 'protein-coding' gene_type a 'transcript' line plus one 'exon' line per
+## exon is written; for any other gene_type a single line using the gene_type as
+## the feature is written.  Each transcript's block is followed by an empty line.
+##
+## Span, orientation and name are taken from the transcript being reported
+## (falling back to the parent's com_name when the isoform has none), so that an
+## isoform is no longer described with the parent transcript's span.
+sub to_transcript_GTF_format {
+    my ($gene_obj) = @_;
+
+    my $gtf_text = "";
+    
+    # all transcripts are reported on the parent gene's asmbl_id.
+    my $asmbl_id = $gene_obj->{asmbl_id};
+
+    foreach my $gene ($gene_obj, $gene_obj->get_additional_isoforms()) {
+        
+        my $gene_id = $gene->{TU_feat_name} || "";
+        my $transcript_id = $gene->{Model_feat_name} || "";
+        my ($lend, $rend) = sort {$a<=>$b} $gene->get_transcript_span();
+        my $orientation = $gene->get_orientation();
+        
+        my $com_name = $gene->{com_name} || $gene_obj->{com_name} || "";
+        # strip characters that would break the attribute column.
+        $com_name =~ s/;/_/g;
+        $com_name =~ s/\"//g;
+        
+        if ($gene->{gene_type} eq "protein-coding") {
+            my @exons = $gene->get_exons();
+            
+            $gtf_text .= join("\t", $asmbl_id, ".", "transcript", $lend, $rend, ".", $orientation, ".", 
+                              "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; name \"$com_name\";") . "\n";
+            
+            foreach my $exon (@exons) {
+                my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords();
+                
+                $gtf_text .= join("\t", $asmbl_id, ".", "exon", $exon_lend, $exon_rend, ".", $orientation, ".", 
+                                  "gene_id \"$gene_id\"; transcript_id \"$transcript_id\";") . "\n";
+            }
+        }
+        else {
+            ## non-protein-coding features
+            $gtf_text .= join("\t", $asmbl_id, ".", $gene->{gene_type}, $lend, $rend, ".", $orientation, ".", 
+                              "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; name \"$com_name\";") . "\n";
+        }
+        
+        $gtf_text .= "\n";
+    }
+    
+    return($gtf_text);
+}
+
+
+
+=over 4
+
+=item to_GTF_format()
+
+B<Description:> Outputs text corresponding to the representation of the gene in GTF format.
+
+B<Parameters:> $genome_seq_ref,  %preferences
+
+B<Returns:> string
+
+
+GTF format is described in "Current Protocols in Bioinformatics(2003)" 4.8.1-4.8.19
+in "Using TWINSCAN to Predict Gene Structures in Genomic DNA Sequences".
+
+Each line of the GTF format includes the following tab-delimited fields:
+
+[seqname] [source] [feature] [start] [end] [score] [strand] [frame] [attributes]
+
+This is further elaborated below:
+
+[feature] contains one of the following: start_codon, stop_codon, CDS
+[attributes] contains 'gene_id' and 'transcript_id' fields.  All features of the same transcript should share the same transcript_id value.  By default, the TU_feat_name and model_feat_name are used as the gene_id and transcript_id, respectively.
+
+
+    Using the %preferences input parameter, the preferred values or gene attributes can be used for seqname, source, gene_id, or transcript_id, each used as a key to the %preferences hash.  Given the value of %preferences is a gene attribute, that attribute value will be used, otherwise, the raw value will be used.
+
+For example:  %preferences = ( seqname => 'mySeqname',
+                               gene_id => 'pub_locus' );
+
+Would result in 'mySeqname' used in the [seqname] field, and the $gene_obj->{pub_locus} value used as the gene_id.
+
+Here are the defaults:
+[seqname] = asmbl_id
+[source] = annotation
+gene_id (TU_feat_name)
+transcript_id (Model_feat_name)
+
+** Partial Genes are NOT Supported **  ( undef is returned )
+** Genes with split start or stop codons are unsupported ** (undef is returned)
+
+=back
+
+=cut
+
+    ;
+
+sub to_GTF_format {
+    my $gene_obj = shift;
+    my ($genome_seq_ref, %preferences) = @_;
+    
+    unless (ref $genome_seq_ref) { 
+        confess "Error, need genome seq reference as param";
+    }
+    
+    my $is_pseudogene = $gene_obj->is_pseudogene();
+    
+
+    my $TU_feat_name = $gene_obj->{TU_feat_name};
+    my $model_feat_name = $gene_obj->{Model_feat_name};
+    
+    # rid whitespace in identifiers
+    $TU_feat_name =~ s/\s+/_/g;
+    $model_feat_name =~ s/\s+/_/g;
+    
+    my $seqname = $preferences{seqname} || $gene_obj->{asmbl_id};
+    my $source = $preferences{source} || $gene_obj->{source} || ".";
+    
+    my $gene_id;
+    if (my $token = $preferences{gene_id}) {
+        $gene_id = $gene_obj->{$token};
+    } else {
+        $gene_id = $TU_feat_name;
+    }
+    
+    my $transcript_id;
+    if (my $token = $preferences{model_id}) {
+        $transcript_id = $gene_obj->{$token};
+    } else {
+        $transcript_id = $model_feat_name;
+    }
+        
+    my @exons = $gene_obj->get_exons();
+    my $orientation = $gene_obj->get_orientation();    
+    my @gtf_text;
+    
+    my $gene_obj_for_gtf = $gene_obj;  #if got stop codon, will need to strip it off.
+	my $com_name = $gene_obj->{com_name};
+	$com_name =~ s/\s+$// if $com_name;
+	$com_name =~ s/[\"\']//g if $com_name;
+	
+	my $name_txt = ($com_name) ? "Name \"$com_name\";" : "";
+	
+    unless ($is_pseudogene) {
+        $gene_obj->set_CDS_phases($genome_seq_ref);
+    
+        
+        ## check for start and stop codons.
+        my $cds_seq = uc $gene_obj->create_CDS_sequence($genome_seq_ref);
+        my @stop_codons = &Nuc_translator::get_stop_codons();
+        
+        my $first_CDS_segment = $gene_obj->get_first_CDS_segment();
+        my $first_phase = $first_CDS_segment->get_phase();
+        my $cds_is_integral_codon_num = (length($cds_seq) % 3 == 0) ? 1 : 0;
+        
+        ## examine start codon:
+        my $init_codon = substr($cds_seq, 0, 3);
+        if ($first_phase == 0 && $init_codon eq 'ATG') { # got start codon.
+            my @start_coordsets = $gene_obj->get_start_codon_coordinates();
+            foreach my $start_pair (@start_coordsets) {
+                my ($start_lend, $start_rend) = sort {$a<=>$b} @$start_pair;
+                push (@gtf_text, [$seqname,
+                                  $source,
+                                  "start_codon",
+                                  $start_lend,
+                                  $start_rend,
+                                  "0",
+                                  $orientation,
+                                  "0",
+                                  "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]);
+            }
+        }
+        
+        my $candidate_stop_codon = uc substr($cds_seq, length($cds_seq) - 3, 3);
+        my @found_stop = grep { $_ eq $candidate_stop_codon } @stop_codons;
+        
+        if (@found_stop) {
+            # got a stop codon.
+            # check to see that the stop codon is in-frame.
+            if ((length($cds_seq) - $first_phase) % 3 == 0) { # yes, stop is in frame.
+                
+                my @stop_codon_coords = $gene_obj->get_stop_codon_coords();
+                foreach my $stop_pair (@stop_codon_coords) {
+                    my ($stop_lend, $stop_rend) = sort {$a<=>$b} @$stop_pair;
+                    
+                    push (@gtf_text, [$seqname,
+                                      $source,
+                                      "stop_codon",
+                                      $stop_lend,
+                                      $stop_rend,
+                                      "0",
+                                      $orientation,
+                                      "0",
+                                      "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]);
+                }
+                
+                $gene_obj_for_gtf = $gene_obj->clone_gene();
+                
+                $gene_obj_for_gtf->trim_stop_codon();
+                
+            }
+        }
+    
+    }
+    
+    ## report the CDS regions:
+    foreach my $exon ($gene_obj_for_gtf->get_exons()) {
+        
+        my $cds = ($is_pseudogene) ? $exon : $exon->get_CDS_exon_obj();
+        
+        if ($cds) {
+            my $phase = ".";
+            unless ($is_pseudogene) {
+                $phase = $cds->get_phase();
+                if ($phase) {
+                    $phase = ($phase == 1) ? 2 : 1; # reverse it according to GFF3 vs. GTF representation.
+                }
+            }
+            
+            my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds->get_coords();
+         
+            push (@gtf_text, [$seqname,
+                              $source,
+                              "CDS",
+                              $cds_lend,
+                              $cds_rend,
+                              "0",
+                              $orientation,
+                              "$phase",
+                              "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]);
+        }
+        
+            
+    }
+    
+    unless ($is_pseudogene) {
+
+        ## Get UTR info:
+        {
+            for my $pair ($gene_obj->get_3prime_UTR_coords) {
+                my ($lend,$rend) = sort {$a<=>$b} @$pair;
+                push (@gtf_text,   [$seqname,
+                                    $source,
+                                    "3UTR",
+                                    $lend,
+                                    $rend,
+                                    "0",
+                                    $orientation,
+                                    "0",
+                                    "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"] );
+            }
+            for my $pair ($gene_obj->get_5prime_UTR_coords) {
+                my ($lend,$rend) = sort {$a<=>$b} @$pair;
+                push (@gtf_text,   [$seqname,
+                                    $source,
+                                    "5UTR",
+                                    $lend, 
+                                    $rend,
+                                    "0",
+                                    $orientation,
+                                    "0",
+                                    "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt" ] );
+            }
+        }
+        
+    }
+    
+    @gtf_text = sort {$a->[3] <=> $b->[3]} @gtf_text;
+    
+    if ($orientation eq '-') {
+        @gtf_text = reverse @gtf_text;
+    }
+        
+    my $GTF = "";
+    foreach my $gtf_row (@gtf_text) {
+        $GTF .= join ("\t", @$gtf_row) . "\n";
+    }
+    
+    foreach my $isoform ($gene_obj->get_additional_isoforms()) {
+        $GTF .= "\n" . $isoform->to_GTF_format($genome_seq_ref, %preferences);
+    }
+    
+    return ($GTF);
+}
+
+
+
+
+
+####
+## get_start_codon_coordinates()
+## Returns a list of [coordA, coordB] pairs that together cover the first three
+## CDS bases of the gene, walking the CDS segments in the order get_exons()
+## yields them.  More than one pair is returned when the first CDS segment(s)
+## are shorter than three bases.  Each pair starts at the segment's end5 and
+## extends toward its end3 (upward on '+', downward otherwise).
+## Dies via confess() if fewer than three CDS bases are available.
+sub get_start_codon_coordinates {
+    my $gene_obj = shift;
+
+    my $strand = $gene_obj->get_orientation();
+
+    # gather the (end5, end3) pair of every exon that carries a CDS segment
+    my @cds_coords;
+    for my $exon ($gene_obj->get_exons()) {
+        my $cds = $exon->get_CDS_exon_obj() or next;
+        my ($end5, $end3) = $cds->get_coords();
+        push @cds_coords, [$end5, $end3];
+    }
+
+    my @codon_pieces;
+    my $bases_needed = 3;
+    for my $segment (@cds_coords) {
+        my ($end5, $end3) = @$segment;
+        my $segment_len = abs($end3 - $end5) + 1;
+
+        # take the whole segment when it is shorter than what is still needed
+        my $take = $bases_needed;
+        $take = $segment_len if $segment_len < $bases_needed;
+
+        my $far_end = ($strand eq '+') ? $end5 + $take - 1
+                                       : $end5 - $take + 1;
+        push @codon_pieces, [$end5, $far_end];
+
+        $bases_needed -= $take;
+        last if $bases_needed <= 0;
+    }
+
+    confess "Error, trouble extracting start codon coordinates from cds coordsets: " . Dumper (\@cds_coords)
+        if $bases_needed > 0;
+
+    return @codon_pieces;
+}
+
+
+
+
+####
+## get_stop_codon_coords()
+## Returns a list of [lower, higher] coordinate pairs that together cover the
+## last three CDS bases of the gene.  CDS segments are visited in the reverse
+## of the order get_exons() yields them; more than one pair is returned when
+## the trailing CDS segment(s) are shorter than three bases.
+## Dies via confess() if fewer than three CDS bases are available.
+sub get_stop_codon_coords {
+    my $gene_obj = shift;
+
+    my $strand = $gene_obj->get_orientation();
+
+    # gather the (end5, end3) pair of every CDS segment, last exon first
+    my @cds_coords;
+    for my $exon (reverse $gene_obj->get_exons()) {
+        my $cds = $exon->get_CDS_exon_obj() or next;
+        my ($end5, $end3) = $cds->get_coords();
+        push @cds_coords, [$end5, $end3];
+    }
+
+    my @codon_pieces;
+    my $bases_needed = 3;
+    for my $segment (@cds_coords) {
+        my ($end5, $end3) = @$segment;
+        my $segment_len = abs($end3 - $end5) + 1;
+
+        # take the whole segment when it is shorter than what is still needed
+        my $take = $bases_needed;
+        $take = $segment_len if $segment_len < $bases_needed;
+
+        # end3 is the higher coordinate on '+', the lower one otherwise
+        my $piece = ($strand eq '+') ? [$end3 - $take + 1, $end3]
+                                     : [$end3, $end3 + $take - 1];
+        push @codon_pieces, $piece;
+
+        $bases_needed -= $take;
+        last if $bases_needed <= 0;
+    }
+
+    confess "Error, trouble extracting stop codon coordinates from cds coordsets: " . Dumper (\@cds_coords)
+        if $bases_needed > 0;
+
+    return @codon_pieces;
+}
+
+
+####
+## trim_stop_codon()
+## Removes the last three CDS bases from this gene object, modifying it in
+## place.  Exons are visited in the reverse of the order get_exons() yields
+## them.  A CDS segment that is consumed entirely is removed through the
+## exon's delete_CDS_exon_obj(); otherwise the segment's end3 is pulled back
+## (decreased on '+', increased otherwise).
+## Dies via confess() if fewer than three CDS bases could be removed.
+sub trim_stop_codon {
+    my $gene_obj = shift;
+
+    my @exons_last_first = reverse $gene_obj->get_exons();
+    my $strand = $gene_obj->get_orientation();
+
+    my $bases_left = 3;
+    for my $exon (@exons_last_first) {
+        my $cds = $exon->get_CDS_exon_obj() or next;
+
+        my ($end5, $end3) = $cds->get_coords();
+        my $segment_len = abs($end3 - $end5) + 1;
+
+        # never trim more than this segment holds
+        my $trim_len = $bases_left;
+        $trim_len = $segment_len if $segment_len < $bases_left;
+
+        if ($trim_len == $segment_len) {
+            # the whole segment belongs to the stop codon: drop it
+            $exon->delete_CDS_exon_obj();
+        }
+        elsif ($strand eq '+') {
+            $cds->{end3} -= $trim_len;
+        }
+        else {
+            $cds->{end3} += $trim_len;
+        }
+
+        $bases_left -= $trim_len;
+        last if $bases_left <= 0;
+    }
+
+    confess "Error, trouble extracting all stop codon coordinates from cds coordsets. " . $gene_obj->toString()
+        if $bases_left > 0;
+
+    return;
+}
+
+
+
+=over 4
+    
+=item to_GFF3_format()
+
+B<Description:> Outputs text corresponding to the representation of the gene in GFF3 format (still under development).
+
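+B<Parameters:> optional key/value preferences: seqid => identifier written to column 1 in place of the gene's asmbl_id; source => value written to column 2 (otherwise the gene's source attribute, or "." when neither is set)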
+
+B<Returns:> string
+
+GFF3 defined at:
+http://song.sourceforge.net/gff3-jan04.shtml
+
+(some text lifted from above site provided below for reference purposes)
+
+The format consists of 9 columns, separated by tabs or spaces.  The
+following unescaped characters are allowed within fields:
+[a-zA-Z0-9.:^*$@!+_?-].  All other characters must be escaped
+using the URL escaping conventions.  Unescaped quotation marks,
+backslashes and other ad-hoc escaping conventions that have been added
+to the GFF format are explicitly forbidden.  The =, ; and % characters
+have reserved meanings as described below, and must be escaped when
+used in other contexts.
+
+Undefined fields are replaced with the "." character, as described in
+the original GFF spec.
+
+Column 1: "seqid"
+
+The ID of the landmark used to establish the coordinate system for the
+current feature.  IDs must contain alphanumeric characters.
+Whitespace, if present, must be escaped using URL escaping rule
+(e.g. space="%20" or "+").  Sequences must *NOT* begin with an
+unescaped ">".
+
+Column 2: "source"
+
+The source of the feature.  This is unchanged from the older GFF specs
+and is not part of a controlled vocabulary.
+
+Column 3: "type"
+
+The type of the feature (previously called the "method").  This is
+constrained to be either: (a) a term from the "lite" sequence
+ontology, SOFA; or (b) a SOFA accession number.  The latter
+alternative is distinguished using the syntax SO:000000.
+
+Columns 4 & 5: "start" and "end"
+
+The start and end of the feature, in 1-based integer coordinates,
+relative to the landmark given in column 1.  Start is always less than
+or equal to end.
+
+For zero-length features, such as insertion sites, start equals end
+and the implied site is to the right of the indicated base.  This
+convention holds regardless of the strandedness of the feature.
+
+Column 6: "score"
+
+The score of the feature, a floating point number.  As in earlier
+versions of the format, the semantics of the score are ill-defined.
+It is strongly recommended that E-values be used for sequence
+similarity features, and that P-values be used for ab initio gene
+prediction features.
+
+Column 7: "strand"
+
+The strand of the feature.  + for positive strand (relative to the
+landmark), - for minus strand, and . for features that are not
+stranded.  In addition, ? can be used for features whose strandedness
+is relevant, but unknown.
+
+Column 8: "phase"
+
+For features of type "exon", the phase indicates where the feature
+begins with reference to the reading frame.  The phase is one of the
+integers 0, 1, or 2, indicating that the first base of the feature
+corresponds to the first, second or last base of the codon,
+respectively.  This is NOT to be confused with the frame, but relates
+to the relative position of the translational start in whatever strand
+the feature is in.
+
+Column 9: "attributes"
+
+A list of feature attributes in the format tag=value.  Multiple
+tag=value pairs are separated by semicolons.  URL escaping rules are
+used for tags or values containing the following characters: ",=;".
+Whitespace should be replaced with the "+" character or the %20 URL
+escape.  This will allow the file to survive text processing programs
+that convert tabs into spaces.
+
+These tags have predefined meanings:
+
+    ID	   Indicates the name of the feature.  IDs must be unique
+	   within the scope of the GFF file.
+
+    Name   Display name for the feature.  This is the name to be
+           displayed to the user.  Unlike IDs, there is no requirement
+	   that the Name be unique within the file.
+
+    Alias  A secondary name for the feature.  It is suggested that
+	   this tag be used whenever a secondary identifier for the
+	   feature is needed, such as locus names and
+	   accession numbers.  Unlike ID, there is no requirement
+	   that Alias be unique within the file.
+
+    Parent Indicates the parent of the feature.  A parent ID can be
+	   used to group exons into transcripts, transcripts into
+	   genes, and so forth.  A feature may have multiple parents.
+
+    Target Indicates the target of a nucleotide-to-nucleotide or
+	   protein-to-nucleotide alignment.  The format of the
+	   value is "target_id+start+end".
+
+    Gap    The alignment of the feature to the target if the two are
+          not colinear (e.g. contain gaps).  The alignment format is
+	  taken from the CIGAR format described in the
+	  Exonerate documentation.
+	  (http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/exonerate
+           ?cvsroot=Ensembl).  See "THE GAP ATTRIBUTE" for a description
+	   of this format.
+
+    Note   A free text note.
+
+    Dbxref A database cross reference.  See the section
+	   "Ontology Associations and Db Cross References" for
+	   details on the format.
+
+    Ontology_term  A cross reference to an ontology term.  See
+           the section "Ontology Associations and Db Cross References"
+	   for details.
+
+Multiple attributes of the same type are indicated by separating the
+values with the comma "," character, as in:
+
+       Parent=AF2312,AB2812,abc-3
+
+Note that attribute names are case sensitive.  "Parent" is not the
+same as "parent".
+
+All attributes that begin with an uppercase letter are reserved for
+later use.  Attributes that begin with a lowercase letter can be used
+freely by applications.
+
+
+
+=back
+
+=cut
+
+    ;
+
+
+
+## to_GFF3_format(%preferences)
+## Renders the gene as GFF3 text and returns it as a single string.
+##
+## Preferences (all optional):
+##     seqid  => value for column 1; defaults to the gene's asmbl_id, and failing
+##               that to a leading run of digits in TU_feat_name (dies if neither).
+##     source => value for column 2; defaults to the gene's source attribute, else ".".
+##
+## A single gene row is always written.  For genes whose gene_type is
+## "protein-coding", each model (this object plus get_additional_isoforms())
+## additionally gets an mRNA row, five_prime_UTR rows, exon rows each followed
+## by their CDS row, and three_prime_UTR rows.
+##
+## Side effect: the first_cds / last_cds keys of the models' CDS objects are
+## rewritten on every call so they always describe the current structure.
+sub to_GFF3_format {
+    my ($gene_obj, %preferences) = @_;
+
+    my $gene_id = $gene_obj->{TU_feat_name};
+
+    my $strand = $gene_obj->get_orientation();
+
+    ## parse preferences
+    my $asmbl_id = $preferences{seqid} || $gene_obj->{asmbl_id};
+    my $source = $preferences{source} || $gene_obj->{source} || ".";
+
+    unless ($asmbl_id) {
+        # last resort: a leading run of digits in the gene identifier
+        if (defined($gene_id) && $gene_id =~ /^(\d+)/) {
+            $asmbl_id = $1;
+        } else {
+            die "Error, no asmbl_id from gene_obj\n";
+        }
+    }
+
+    my ($gene_lend, $gene_rend) = sort {$a<=>$b} $gene_obj->get_gene_span();
+
+    # com_name may be unset; an undefined name or one without any word character counts as empty
+    my $com_name = $gene_obj->{com_name};
+    unless (defined($com_name) && $com_name =~ /\w/) {
+        $com_name = "";
+    }
+
+    if ($com_name) {
+        # uri escape it:
+        $com_name = uri_escape($com_name);
+    }
+
+    my $gene_alias = "";
+    if (my $pub_locus = $gene_obj->{pub_locus}) {
+        $gene_alias = "Alias=$pub_locus;";
+    }
+
+    # gene_type may be unset; compare against an empty string rather than undef
+    my $gene_type = defined($gene_obj->{gene_type}) ? $gene_obj->{gene_type} : "";
+    my $feat_type = ($gene_type eq "protein-coding") ? "gene" : $gene_type;
+
+    ## note, non-coding gene features are currently represented by a simple single coordinate pair.
+    my $gff3_text = "$asmbl_id\t$source\t$feat_type\t$gene_lend\t$gene_rend\t.\t$strand\t.\tID=$gene_id;Name=$com_name;$gene_alias\n";
+
+    if ($gene_type eq "protein-coding") {
+
+        foreach my $model ($gene_obj, $gene_obj->get_additional_isoforms()) {
+
+            my $model_id = $model->{Model_feat_name};
+            my $model_alias = "";
+            if (my $model_locus = $model->{Model_pub_locus}) {
+                $model_alias = "Alias=$model_locus;";
+            }
+
+            # GFF3 requires start <= end, so order the span like every other feature here
+            my ($mrna_lend, $mrna_rend) = sort {$a<=>$b} $model->get_transcript_span();
+
+            $gff3_text .= "$asmbl_id\t$source\tmRNA\t$mrna_lend\t$mrna_rend\t.\t$strand\t.\tID=$model_id;Parent=$gene_id;Name=$com_name;$model_alias\n";
+
+            ## mark the first and last CDS entries (for now, an unpleasant hack!)
+            ## Flags left by an earlier call are cleared first so that a changed
+            ## exon structure cannot leave more than one CDS marked.
+            my @cds_objs = grep { $_ } map { scalar($_->get_CDS_obj()) } $model->get_exons();
+            foreach my $cds (@cds_objs) {
+                delete $cds->{first_cds};
+                delete $cds->{last_cds};
+            }
+            if (@cds_objs) {
+                $cds_objs[0]->{first_cds} = 1;
+                $cds_objs[-1]->{last_cds} = 1;
+            }
+
+            my $prime5_partial = $model->is_5prime_partial();
+            my $prime3_partial = $model->is_3prime_partial();
+
+            ## annotate 5' utr
+            if ($model->has_CDS()) {
+                my $utr_count = 0;
+                foreach my $coordset ($model->get_5prime_UTR_coords()) {
+                    my ($lend, $rend) = sort {$a<=>$b} @$coordset;
+                    $utr_count++;
+                    my $utr_id = "$model_id.utr5p$utr_count";
+                    $gff3_text .= "$asmbl_id\t$source\tfive_prime_UTR\t$lend\t$rend\t.\t$strand\t.\tID=$utr_id;Parent=$model_id\n";
+                }
+            }
+
+            my $exon_counter = 0;
+            foreach my $exon ($model->get_exons()) {
+                $exon_counter++;
+                my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords();
+
+                # prefer the exon's own feat_name; otherwise number exons within the model
+                my $exon_ID_string = $exon->{feat_name} ? "$exon->{feat_name}" : "$model_id.exon$exon_counter";
+
+                $gff3_text .= "$asmbl_id\t$source\texon\t$exon_lend\t$exon_rend\t.\t$strand\t.\tID=${exon_ID_string};Parent=$model_id\n";
+
+                if (my $cds_obj = $exon->get_CDS_obj()) {
+                    my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds_obj->get_coords();
+
+                    my $phase = $cds_obj->{phase};
+                    if (defined($phase)) {
+                        ## use GFF3 definition of phase, which is how many bases to trim before encountering first base of start
+                        if ($phase == 2) {
+                            $phase = 1;
+                        }
+                        elsif ($phase == 1) {
+                            $phase = 2;
+                        }
+                        # phase 0 remains 0
+                    }
+                    else {
+                        $phase = "."; # no phase info available
+                    }
+
+                    # according to the GFF3 spec, CDS segments from the same coding region should have the same identifier.
+                    my $cds_ID_string = "cds.$model_id";
+
+                    my $partial_text = "";
+                    if ($prime5_partial && $cds_obj->{first_cds}) {
+                        $partial_text .= ";5_prime_partial=true";
+                    }
+                    if ($prime3_partial && $cds_obj->{last_cds}) {
+                        $partial_text .= ";3_prime_partial=true";
+                    }
+
+                    $gff3_text .= "$asmbl_id\t$source\tCDS\t$cds_lend\t$cds_rend\t.\t$strand\t$phase\tID=${cds_ID_string};Parent=$model_id$partial_text\n";
+                }
+            }
+
+            ## annotate 3' utr
+            if ($model->has_CDS()) {
+                my $utr_count = 0;
+                foreach my $coordset ($model->get_3prime_UTR_coords()) {
+                    my ($lend, $rend) = sort {$a<=>$b} @$coordset;
+                    $utr_count++;
+                    my $utr_id = "$model_id.utr3p$utr_count";
+                    $gff3_text .= "$asmbl_id\t$source\tthree_prime_UTR\t$lend\t$rend\t.\t$strand\t.\tID=$utr_id;Parent=$model_id\n";
+                }
+            }
+        }
+
+    }  ## end of protein-coding genes
+
+    ## strip off any trailing whitespace and semicolons:
+    my @lines = split (/\n/, $gff3_text);
+    foreach my $line (@lines) {
+        $line =~ s/\s+$//;
+        $line =~ s/;$//;
+    }
+
+    $gff3_text = join ("\n", @lines) . "\n";
+
+    return ($gff3_text);
+
+}
+
+
+
+=over 4
+
+=item to_BED_format()
+
+B<Description:> describes gene in BED format
+
+B<Parameters:> (uri_encode => 1|0, score => value for the BED score column; defaults to 0)
+
+B<Returns:> string
+
+
+BED format described here:
+http://genome.ucsc.edu/FAQ/FAQformat.html#format1
+
+	BED format
+
+
+	
+
+BED format provides a flexible way to define the data lines that are displayed in an annotation track. BED lines have three required fields and nine additional optional fields. The number of fields per line must be consistent throughout any single set of data in an annotation track. The order of the optional fields is binding: lower-numbered fields must always be populated if higher-numbered fields are used. 
+
+The first three required BED fields are:
+
+1. chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671). 
+
+2. chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. 
+
+3. chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. 
+
+The 9 additional optional BED fields are:
+
+4. name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode. 
+	
+5. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). The UCSC format page includes a table showing the Genome Browser's translation of BED score values into shades of gray.
+
+6. strand - Defines the strand - either '+' or '-'. 
+
+7. thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). 
+
+8. thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). 
+
+9. itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser. 
+
+10. blockCount - The number of blocks (exons) in the BED line. 
+
+11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. 
+
+12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. 
+
+Example:
+Here's an example of an annotation track that uses a complete BED definition:
+track name=pairedReads description="Clone Paired Reads" useScore=1
+chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
+chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
+
+
+
+
+=back
+
+=cut
+
+
+## to_BED_format(%params)
+## Returns the gene as 12-column BED text: one line for this object followed by
+## the lines of every object returned by get_additional_isoforms().
+##
+## Params (optional):
+##     score      => value for the score column (0 when unset or false)
+##     uri_encode => when true, the name column is passed through uri_escape()
+##
+## The name column is assembled as "ID=<Model_feat_name>;<TU_feat_name>;[Alias=<pub_locus>;]<com_name>"
+## from whichever of those attributes are set, with spaces turned into underscores.
+sub to_BED_format {
+    my ($self, %params) = @_;
+
+    my $strand = $self->get_strand();
+
+    # thick (coding) region, ordered left to right
+    my ($thick_lend, $thick_rend) = sort {$a<=>$b} $self->get_CDS_span();
+
+    my $scaffold = $self->{asmbl_id};
+    my $gene_id  = $self->{TU_feat_name};
+    my $trans_id = $self->{Model_feat_name};
+    my $score    = $params{score} || 0;
+
+    # build the name column from the innermost piece outwards
+    my $name = $self->{com_name} || "";
+    my $alias = $self->{pub_locus};
+    $name = "Alias=$alias;$name" if $alias;
+    $name = "$gene_id;$name" if $gene_id;
+    $name = $trans_id ? "ID=$trans_id;$name" : "ID=$name";
+    $name = uri_escape($name) if $params{uri_encode};
+    $name =~ tr/ /_/;
+
+    # exon blocks as [lend, rend], ordered by the exons' end5
+    my @blocks;
+    for my $exon (sort {$a->{end5}<=>$b->{end5}} $self->get_exons()) {
+        my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords();
+        push @blocks, [$lend, $rend];
+    }
+
+    my $gene_lend = $blocks[0]->[0];
+    my $gene_rend = $blocks[$#blocks]->[1];
+
+    # block starts are relative to the gene's left end; sizes are inclusive lengths
+    my @block_starts = map { $_->[0] - $gene_lend } @blocks;
+    my @block_sizes  = map { $_->[1] - $_->[0] + 1 } @blocks;
+
+    ## construct bed output; chromStart and thickStart are shifted by one (zero-based starts).
+    my @columns = (
+        $scaffold,
+        $gene_lend-1, $gene_rend,
+        $name,
+        $score,
+        $strand,
+        $thick_lend-1, $thick_rend,
+        "0", # rgb info - use '.' to allow user customization in IGV.   Need 0 for compatibility with UCSC browser.
+        scalar(@block_sizes),
+        join(",", @block_sizes),
+        join(",", @block_starts),
+    );
+    my $bed_text = join("\t", @columns) . "\n";
+
+    for my $isoform ($self->get_additional_isoforms()) {
+        $bed_text .= $isoform->to_BED_format(%params);
+    }
+
+    return($bed_text);
+}
+
+
+
+# static method, returns gene object.
+sub BED_line_to_gene_obj {
+    ## Builds a Gene_obj from one line of 12-column, tab-delimited BED text.
+    ##
+    ## Parameters: the BED line as plain text (not a reference); a trailing
+    ##             line terminator is tolerated.
+    ## Returns:    a new Gene_obj whose exons and CDS range come from the BED
+    ##             blocks and thick coordinates.  com_name holds the BED name;
+    ##             TU_feat_name is that name with whitespace runs replaced by "|",
+    ##             and Model_feat_name is the same with an "m." prefix.
+    ## Dies (confess) when given a reference, an undefined value, fewer than
+    ## 12 fields, or blockSizes/blockStarts lists of different lengths.
+    my ($bed_line) = @_;
+
+    if (ref $bed_line) {
+        confess "Error, static method, just provide bed text line, returns gene_obj";
+    }
+    unless (defined $bed_line) {
+        confess "Error, no bed text line provided";
+    }
+
+    # drop the line terminator so it cannot end up inside the last field
+    $bed_line =~ s/[\r\n]+$//;
+
+    my @x = split(/\t/, $bed_line);
+    if (scalar(@x) < 12) {
+        confess "Error, expected 12 tab-delimited BED fields but found " . scalar(@x) . " in line: $bed_line";
+    }
+
+    my $scaff = $x[0];
+    my $gene_lend = $x[1] + 1;   # BED chromStart is zero-based
+
+    my $com_name = $x[3];
+
+    my $orient = $x[5];
+    if ($orient eq '*') {
+        $orient = '+';
+    }
+
+    my $coding_lend = $x[6] + 1; # thickStart is zero-based as well
+    my $coding_rend = $x[7];
+
+    # a trailing comma in either list is harmless: split drops trailing empty fields
+    my @lengths = split(/,/, $x[10]);
+    my @exon_relative_starts = split(/,/, $x[11]);
+
+    if (scalar(@lengths) != scalar(@exon_relative_starts)) {
+        confess "Error, number of block sizes (" . scalar(@lengths) . ") differs from number of block starts ("
+            . scalar(@exon_relative_starts) . ") in line: $bed_line";
+    }
+
+    my @exons;
+    foreach my $i (0 .. $#lengths) {
+        my $len = $lengths[$i];
+        my $start = $exon_relative_starts[$i];
+
+        # block starts are offsets from the (one-based) gene start
+        my $exon_lend = $gene_lend + $start;
+        my $exon_rend = $exon_lend + $len - 1;
+
+        print "Len: $len, start=$start   ====>  $exon_lend - $exon_rend\n" if $DEBUG;
+
+        push (@exons, [$exon_lend, $exon_rend]);
+    }
+
+    print "Coding: $coding_lend-$coding_rend, Exons: " . Dumper (\@exons) if $DEBUG;
+
+    my $gene_obj = Gene_obj->new();
+    $gene_obj->build_gene_obj_exons_n_cds_range(\@exons, $coding_lend, $coding_rend, $orient);
+
+    $gene_obj->{com_name} = $com_name;
+    $gene_obj->{asmbl_id} = $scaff;
+
+    $com_name =~ s/\s+/\|/g; # reformat as an identifier with no whitespace
+
+    $gene_obj->{TU_feat_name} = "$com_name";
+    $gene_obj->{Model_feat_name} = "m.$com_name";
+
+    return($gene_obj);
+}
+
+
+
+
+
+
+## Private, remove leading and trailing whitespace characters:
+sub trim_leading_trailing_ws {
+    # Strips leading and trailing whitespace in place.
+    # Accepts a reference to a scalar (the string is trimmed) or to an array
+    # (every element is trimmed); any other argument is fatal.
+    my ($ref) = @_;
+    my $type = ref $ref;
+
+    if ($type eq "ARRAY") {
+        for my $item (@$ref) {
+            $item =~ s/^\s+|\s+$//g;
+        }
+    }
+    elsif ($type eq "SCALAR") {
+        $$ref =~ s/^\s+|\s+$//g;
+    }
+    else {
+        die "Currently don't support trim_leading_trailing_ws(ref type: $type)\n";
+    }
+}
+
+
+
+=over 4
+
+=item to_GTF2_format()
+
+B<Description:> provides gene in GTF2 format
+
+B<Parameters:> genome_seq_ref, [properties_href]
+
+B<Returns:> text
+
+
+properties_href encodes preferences like so
+
+    properties_href = { 
+                          seqname => tigr_asmbl_id_1000, # by default, asmbl_id is used as encoded in gene_obj
+                          
+                          source => MyGenePrediction,   # by default, set to "TIGR"
+
+                          include_comments => 0,   # turned on by default, indicating partial or pseudogenes with preceding comment lines
+
+                      }
+
+
+
+The GTF2 format is described here:
+http://genes.cs.wustl.edu/GTF2.html
+
+as follows:
+
+GTF2 format (Revised Ensembl GTF)
+Gene transfer format. This borrows from GFF, but has additional structure that warrants a separate definition and format name.
+NEW! Validating Parser for GTF
+
+Structure is as GFF, so the fields are:
+<seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+
+Here is a simple example with 3 translated exons. Order of rows is not important.
+
+AB000381 Twinscan  CDS          380   401   .   +   0  gene_id "001"; transcript_id "001.1";
+AB000381 Twinscan  CDS          501   650   .   +   2  gene_id "001"; transcript_id "001.1";
+AB000381 Twinscan  CDS          700   707   .   +   2  gene_id "001"; transcript_id "001.1";
+AB000381 Twinscan  start_codon  380   382   .   +   0  gene_id "001"; transcript_id "001.1";
+AB000381 Twinscan  stop_codon   708   710   .   +   0  gene_id "001"; transcript_id "001.1";
+
+The whitespace in this example is provided only for readability. In GTF, fields must be separated by a single TAB and no white space.
+
+<seqname>
+The FPC contig ID from the Golden Path.
+
+<source>
+The source column should be a unique label indicating where the annotations came from --- typically the name of either a prediction program or a public database.
+
+<feature>
+The following feature types are required: "CDS", "start_codon", "stop_codon". The feature "exon" is optional, since this project will not evaluate predicted splice sites outside of protein coding regions. All other features will be ignored.
+
+CDS represents the coding sequence starting with the first translated codon and proceeding to the last translated codon. Unlike Genbank annotation, the stop codon is not included in the CDS for the terminal exon.
+
+<start> <end>
+Integer start and end coordinates of the feature relative to the beginning of the sequence named in <seqname>.  <start> must be less than or equal to <end>. Sequence numbering starts at 1. Values of <start> and <end> that extend outside the reference sequence are technically acceptable, but they are discouraged for purposes of this project.
+
+<score>
+The score field will not be used for this project, so you can either provide a meaningful float or replace it by a dot.
+
+<frame>
+0 indicates that the first whole codon of the reading frame is located at 5'-most base. 1 means that there is one extra base before the first codon and 2 means that there are two extra bases before the first codon. Note that the frame is not the length of the CDS mod 3.
+
+Here are the details excised from the GFF spec. Important: Note comment on reverse strand.
+
+    '0' indicates that the specified region is in frame, i.e. that its first base corresponds to the first base of a codon. '1' indicates that there is one extra base, i.e. that the second base of the region corresponds to the first base of a codon, and '2' means that the third base of the region is the first base of a codon. If the strand is '-', then the first base of the region is value of <end>, because the corresponding coding region will run from <end> to <start> on the reverse strand.
+
+[attributes]
+All four features have the same two mandatory attributes at the end of the record:
+
+    * gene_id value;     A globally unique identifier for the genomic source of the transcript
+    * transcript_id value;     A globally unique identifier for the predicted transcript.
+
+These attributes are designed for handling multiple transcripts from the same genomic region. Any other attributes or comments must appear after these two and will be ignored.
+
+Attributes must end in a semicolon which must then be separated from the start of any subsequent attribute by exactly one space character (NOT a tab character).
+
+Textual attributes should be surrounded by doublequotes.
+
+Here is an example of a gene on the negative strand. Larger coordinates are 5' of smaller coordinates. Thus, the start codon is the 3 bp with the largest coordinates among all those bp that fall within the CDS regions. Similarly, the stop codon is the 3 bp with coordinates just less than the smallest coordinates within the CDS regions.
+
+AB000123    Twinscan     CDS    193817    194022    .    -    2    gene_id "AB000123.1"; transcript_id "AB00123.1.2";
+AB000123    Twinscan     CDS    199645    199752    .    -    2    gene_id "AB000123.1"; transcript_id "AB00123.1.2";
+AB000123    Twinscan     CDS    200369    200508    .    -    1    gene_id "AB000123.1"; transcript_id "AB00123.1.2";
+AB000123    Twinscan     CDS    215991    216028    .    -    0    gene_id "AB000123.1"; transcript_id "AB00123.1.2";
+AB000123    Twinscan     start_codon   216026    216028    .    -    .    gene_id    "AB000123.1"; transcript_id "AB00123.1.2";
+AB000123    Twinscan     stop_codon    193814    193816    .    -    .    gene_id    "AB000123.1"; transcript_id "AB00123.1.2";
+
+Note the frames of the coding exons. For example:
+
+   1. The first CDS (from 216028 to 215991) always has frame zero.
+   2. Frame of the 1st CDS =0, length =38.  (frame - length) % 3  = 1, the frame of the 2nd CDS.
+   3. Frame of the 2nd CDS=1, length=140. (frame - length) % 3  = 2, the frame of the 3rd CDS.
+   4. Frame of the 3rd CDS=2, length=108. (frame - length) % 3  =  2, the frame of the terminal CDS.
+   5. Alternatively, the frame of terminal CDS can be calculated without the rest of the gene. Length of the terminal CDS=206. length % 3 =2, the frame of the terminal CDS.
+
+Here is an example in which the "exon" feature is used. It is a 5 exon gene with 3 translated exons.
+
+AB000381 Twinscan  exon         150   200   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  exon         300   401   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  CDS          380   401   .   +   0  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  exon         501   650   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  CDS          501   650   .   +   2  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  exon         700   800   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  CDS          700   707   .   +   2  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  exon         900  1000   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  start_codon  380   382   .   +   0  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+AB000381 Twinscan  stop_codon   708   710   .   +   0  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
+  
+
+
+
+
+=back
+
+=cut
+
+
+
+## Serializes this gene model as GTF2 text (format described in the POD above).
+##
+## Parameters:
+##   $genomic_seq_ref  - handed to set_CDS_phases()
+##   $properties_href  - optional hashref; keys consulted here:
+##                         seqname          => overrides $self->{asmbl_id} in column 1
+##                         source           => column 2 (defaults to "TIGR")
+##                         include_comments => a defined value == 0 suppresses the leading "#" line
+##
+## Returns: the optional "#..." comment line followed by the start_codon / exon / CDS /
+## stop_codon records.  Records are only emitted when gene_type is "protein-coding";
+## the output of every additional isoform is appended recursively.
+##
+## Side effects: set_CDS_phases() resets the 5'/3' partial flags, and _remove_stop_codons()
+## shortens or nullifies the terminal CDS object(s) of this gene in place.
+##
+## Dies (die/confess) when no sequence name is available, when a CDS lacks a phase, or when
+## start/stop codons cannot be validated or extracted.
+##
+## Note: no prototype is declared.  Prototypes are ignored for method calls, and an empty
+## prototype wrongly advertises a sub that takes no arguments.
+sub to_GTF2_format {
+    my $self = shift;
+    my $genomic_seq_ref = shift;
+    my $properties_href = shift || {};
+
+    ## need to adjust my frame definition so it's consistent with requirements above in spec.
+    my %frame_for_phase = ( 0 => 0,
+                            1 => 2,
+                            2 => 1 );
+    my $frame_convert = sub {
+        my $phase = shift;
+        return ($frame_for_phase{$phase});
+    };
+
+    my $gtf2_text = "";
+
+    my $gene_obj = $self;
+
+    my $asmbl_id = $properties_href->{seqname} || $gene_obj->{asmbl_id} || die "Error, no asmbl_id as gene_obj att";
+
+    my $source = $properties_href->{source} || "TIGR";
+
+    my $gene_id = $gene_obj->{TU_feat_name};
+    my $model_id = $gene_obj->{Model_feat_name};
+    my $strand = $gene_obj->get_orientation();
+
+    my $comment_line = "";
+    if ($gene_obj->is_pseudogene()) {
+        $comment_line .= "$model_id=pseudogene ";
+    }
+
+    # An unset gene_type simply means "not protein-coding"; avoid an uninitialized-value warning.
+    my $gene_type = $gene_obj->{gene_type};
+    if (defined($gene_type) && $gene_type eq "protein-coding") {
+
+        if (! $gene_obj->is_pseudogene()) {
+
+            $gene_obj->set_CDS_phases($genomic_seq_ref);
+            # also resets the 5' and 3' partiality attributes based on the longest orf.
+
+            if ($gene_obj->is_5prime_partial()) {
+                $comment_line .= "$model_id=5'partial ";
+            }
+            else {
+                $gene_obj->validate_start_codon();
+            }
+
+            if ($gene_obj->is_3prime_partial() ) {
+                $comment_line .= "$model_id=3'partial ";
+            }
+            else {
+                $gene_obj->validate_stop_codon();
+            }
+        }
+
+        my @stop_codon_objs;
+        my @start_codons;
+
+        if (! $gene_obj->is_pseudogene()) {
+
+            # GTF2 keeps the stop codon out of the CDS, so it is split off the CDS objects here.
+            if (! $gene_obj->is_3prime_partial())  {
+                @stop_codon_objs = $gene_obj->_remove_stop_codons();
+
+                unless (@stop_codon_objs) {
+                    confess $gene_obj->toString() . "Error, no stop codon objs retrieved for non 3' partial gene";
+                }
+            }
+            if (! $gene_obj->is_5prime_partial()) {
+                @start_codons = $self->_extract_start_codons();
+
+                unless (@start_codons) {
+                    confess $gene_obj->toString() . "Error, no start codon extracted for non 5'partial gene.";
+                }
+            }
+
+        }
+
+        foreach my $start_codon (@start_codons) {
+            my ($start_lend, $start_rend) = sort {$a<=>$b} $start_codon->get_coords();
+            my $phase = $frame_convert->($start_codon->{phase});
+            $gtf2_text .= "$asmbl_id\t$source\tstart_codon\t$start_lend\t$start_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n";
+        }
+
+        foreach my $exon ($gene_obj->get_exons()) {
+            my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords();
+            $gtf2_text .= "$asmbl_id\t$source\texon\t$exon_lend\t$exon_rend\t.\t$strand\t.\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n";
+
+            if ($gene_obj->is_pseudogene()) { next; } # don't bother trying to report nonsensical CDSs.
+
+            if (my $cds_obj = $exon->get_CDS_obj()) {
+                my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds_obj->get_coords();
+                my $phase = $cds_obj->{phase};
+                unless (defined($phase)) {
+                    die "Error, no phase defined for cds($cds_lend-$cds_rend) of gene" . $gene_obj->toString();
+                }
+                $phase = $frame_convert->($phase);
+
+                $gtf2_text .= "$asmbl_id\t$source\tCDS\t$cds_lend\t$cds_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n";
+            }
+        }
+
+        foreach my $stop_codon (@stop_codon_objs) {
+            my ($stop_lend, $stop_rend) = sort {$a<=>$b} $stop_codon->get_coords();
+            my $phase = $frame_convert->($stop_codon->{phase});
+            $gtf2_text .= "$asmbl_id\t$source\tstop_codon\t$stop_lend\t$stop_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n";
+        }
+
+        foreach my $isoform ($gene_obj->get_additional_isoforms() ) {
+            $gtf2_text .= $isoform->to_GTF2_format($genomic_seq_ref, $properties_href);
+        }
+    }
+
+    if ($comment_line) {
+        # prefix with \# to actually comment it in the file
+        $comment_line = "#$comment_line\n";
+    }
+
+    my $comment_flag = $properties_href->{include_comments};
+    if (defined ($comment_flag) && $comment_flag == 0) {
+        $comment_line = ""; # clear it
+    }
+
+    return ($comment_line . $gtf2_text);
+}
+
+
+## Builds the CDS_exon_obj piece(s) that make up the start codon of this gene.
+##
+## Returns: an empty list when the gene is flagged 5' partial; otherwise one to three
+## objects, in gene order, whose lengths sum to 3.  When the first CDS segment(s) are
+## shorter than the codon, the existing CDS objects themselves are returned for the
+## leading parts and a new object is created for the remainder.
+##
+## Dies when no CDS exists, when a split start codon cannot be completed from the
+## following exons, or when the collected parts do not add up to 3 bp.
+sub _extract_start_codons {
+    my $self = shift;
+
+    ## 5' partiality attribute is trusted here !!!
+    if ($self->is_5prime_partial()) {
+        return();
+    }
+
+    my @exons = $self->get_exons();
+    my $orientation = $self->get_orientation();
+
+    # Coordinates run upward from end5 on '+', downward otherwise.
+    my $step = ($orientation eq '+') ? 1 : -1;
+
+    my @start_codons;
+    my $found_cds_flag = 0;
+
+    for (my $i = 0; $i <= $#exons; $i++) {
+        my $cds = $exons[$i]->get_CDS_obj() or next;
+
+        # found first cds; every path below ends the search.
+        $found_cds_flag = 1;
+        my ($cds_end5) = $cds->get_coords();
+        my $cds_len = $cds->length();
+
+        if ($cds_len >= 3) {
+            ## got start codon in entirety
+            push (@start_codons, CDS_exon_obj->new($cds_end5, $cds_end5 + 2 * $step)->set_phase(0));
+            last;
+        }
+
+        ## split start codon
+        push (@start_codons, $cds); # add current cds as start codon part
+        my $missing_length = 3 - $cds_len;
+
+        ## examine next cds exon for part of it.  Guard against running off the exon
+        ## list so that the intended error is raised rather than a method call on undef.
+        my $next_exon = $exons[$i+1];
+        my $next_cds = (ref $next_exon) ? $next_exon->get_CDS_obj() : undef;
+        unless (ref $next_cds) {
+            die "Error, no next cds for split start codon" . $self->toString();
+        }
+
+        my ($next_cds_end5) = $next_cds->get_coords();
+        my $next_cds_len = $next_cds->length();
+
+        if ($next_cds_len >= $missing_length) {
+            # great, this has everything we need
+            push (@start_codons,
+                  CDS_exon_obj->new($next_cds_end5, $next_cds_end5 + $step * ($missing_length - 1))->set_phase($next_cds->{phase}));
+            last;
+        }
+
+        ## another split start codon portion.  Just add the current cds, and get the first bp from the next cds
+        push (@start_codons, $next_cds);
+
+        my $final_exon = $exons[$i+2];
+        my $final_cds = (ref $final_exon) ? $final_exon->get_CDS_obj() : undef;
+        unless (ref $final_cds) {
+            die "Error getting final cds of three-part split start codon";
+        }
+        unless ($final_cds->{phase} == 2) {
+            die "Error, final cds of three-part start codon is not in phase 2 ";
+        }
+        my ($final_cds_end5) = $final_cds->get_coords();
+        push (@start_codons,
+              CDS_exon_obj->new($final_cds_end5, $final_cds_end5)->set_phase(2));
+        last;
+    } # end of foreach exon
+
+    unless ($found_cds_flag) {
+        die "Error, no cds exon found in search of start codon";
+    }
+
+    unless (@start_codons) {
+        die "Error, no start codons found";
+    }
+
+    ## ensure start codons sum to 3
+    my $sum_len = 0;
+    foreach my $start_codon (@start_codons) {
+        $sum_len += $start_codon->length();
+    }
+    unless ($sum_len == 3) {
+        # Carry the diagnostics in the exception itself instead of printing them to STDOUT.
+        my $message = "Error, sum len of start codons != 3 ( = $sum_len, instead) " . $self->toString() . "starts:\n";
+        my $part_no = 0;
+        foreach my $start (@start_codons) {
+            $part_no++;
+            $message .= "start($part_no): " . $start->toString();
+        }
+        die $message;
+    }
+
+    return (@start_codons);
+
+}
+
+
+
+## Splits the stop codon off the 3'-most CDS segment(s) of this gene and returns it.
+##
+## Returns: an empty list when the gene is flagged 3' partial; otherwise one to three
+## CDS_exon_obj pieces, in gene order, whose lengths sum to 3.
+##
+## Side effects (in place, on this gene's exons): the terminal CDS object has its end3
+## pulled back by the stop codon length, and a CDS segment that consists only of stop
+## codon bases is detached from its exon (the exon's CDS_exon_obj is set to 0).
+##
+## Dies when no CDS exists, when a split stop codon cannot be completed from the
+## preceding exons, or when the collected parts do not add up to 3 bp.
+sub _remove_stop_codons {
+    my $self = shift;
+
+    ## 3' partiality attribute is trusted here !!!
+    if ($self->is_3prime_partial()) {
+        return ();
+    }
+
+    my $orientation = $self->get_orientation();
+    my @exons = reverse $self->get_exons(); # examining exons in reverse order, starting from stop codon direction.
+
+    my @stop_codons;
+
+    my $found_cds_flag = 0;
+
+    ## find the 3'-most exon carrying a CDS
+    for (my $i=0; $i <= $#exons; $i++) {
+        my $cds = $exons[$i]->get_CDS_obj() or next;
+
+        $found_cds_flag = 1;
+
+        my ($cds_end5, $cds_end3) = $cds->get_coords();
+        my $cds_length = $cds->length();
+
+        if ($cds_length > 3) {
+            ## cds exon encodes more than just the stop codon
+            if ($orientation eq '+') {
+                $cds->{end3} -= 3;
+                push (@stop_codons, CDS_exon_obj->new($cds_end3 - 2, $cds_end3)->set_phase(0));
+            }
+            else {
+                $cds->{end3} += 3;
+                push (@stop_codons, CDS_exon_obj->new($cds_end3 + 2, $cds_end3)->set_phase(0));
+            }
+            last;
+        }
+
+        if ($cds_length == 3) {
+            ## Just a stop codon exon.  We can remove it.
+            push (@stop_codons, $cds);
+            $exons[$i]->{CDS_exon_obj} = 0; # nullified
+            last;
+        }
+
+        ## cds exon encodes a split stop codon
+        push (@stop_codons, $cds); # just add the last portion of stop codon
+        $exons[$i]->{CDS_exon_obj} = 0; # nullified
+
+        ## check next portion of cds exon to see if it contains the rest of the stop
+        my $second_exon = $exons[$i+1];
+        unless (ref $second_exon) {
+            die "Error, incomplete stop codon and not enough exons! ";
+        }
+        my $missing_stop_length = 3 - $cds_length;
+        my $second_cds = $second_exon->get_CDS_obj();
+        unless (ref $second_cds) {
+            die "Error, next cds obj is missing!";
+        }
+
+        my $second_length = $second_cds->length();
+        my (undef, $second_end3) = $second_cds->get_coords();
+
+        if ($second_length <= $missing_stop_length) {
+            ## encodes only the second part of the stop codon
+            # add and nullify
+            push (@stop_codons, $second_cds);
+            $second_exon->{CDS_exon_obj} = 0;
+
+            ## get the very last part of the stop
+            $missing_stop_length -= $second_length;
+            if ($missing_stop_length > 0) {
+                ## must be still missing the first bp of the stop codon
+                if ($missing_stop_length != 1) {
+                    die "Error, too much of the stop codon is left (missing_length = $missing_stop_length).  Should only be 1 ";
+                }
+                my $third_exon = $exons[$i+2];
+                unless (ref $third_exon) {
+                    die "Error, second next exon is unavail ";
+                }
+                my $third_cds = $third_exon->get_CDS_obj();
+                unless (ref $third_cds) {
+                    die "Error, second next cds obj is unavail";
+                }
+                my $third_length = $third_cds->length();
+                my (undef, $third_end3) = $third_cds->get_coords();
+                # NOTE(review): a third CDS of exactly 1 bp is not consumed here, so the
+                # length check at the end of this sub will die for such a gene.
+                if ($third_length > 1) {
+                    if ($orientation eq '+') {
+                        $third_cds->{end3} -= 1;
+                    }
+                    else {
+                        $third_cds->{end3} += 1;
+                    }
+                    push (@stop_codons, CDS_exon_obj->new($third_end3, $third_end3)->set_phase(0));
+                }
+            }
+        }
+        else {
+            # split stop codon
+            # missing length of cds exon is present in the second portion of the stop
+            if ($orientation eq '+') {
+                $second_cds->{end3} -= $missing_stop_length;
+                push (@stop_codons, CDS_exon_obj->new($second_end3 - $missing_stop_length + 1, $second_end3)->set_phase(0));
+            }
+            else {
+                $second_cds->{end3} += $missing_stop_length;
+                push (@stop_codons, CDS_exon_obj->new($second_end3 + $missing_stop_length - 1, $second_end3)->set_phase(0));
+            }
+        }
+
+        last;
+
+    } # end of foreach exon
+
+    unless ($found_cds_flag) {
+        die "Error, no cds exon was found. ";
+    }
+
+    unless (@stop_codons) {
+        die "Error, no stop codons extracted from non-partial gene.";
+    }
+
+    @stop_codons = reverse @stop_codons; # reorder according to gene direction
+
+    ## make sure sum (stop_codons) length == 3
+    my $sum_len = 0;
+    foreach my $stop_codon (@stop_codons) {
+        $sum_len += $stop_codon->length();
+    }
+    if ($sum_len != 3) {
+        # Carry the diagnostics in the exception itself instead of printing them to STDOUT,
+        # and number the parts (the counter used to stay at 0 for every part).
+        my $message = "Error, stop codons sum length != 3 ( = $sum_len, instead) " . $self->toString();
+        my $part_no = 0;
+        foreach my $stop_codon (@stop_codons) {
+            $part_no++;
+            $message .= "stop($part_no): " . $stop_codon->toString();
+        }
+        die $message;
+    }
+
+    return (@stop_codons);
+
+}
+                
+
+
+## Reports whether any exon of this gene carries a CDS object.
+## Returns 1 as soon as an exon's get_CDS_obj() yields a reference, otherwise 0.
+sub has_CDS {
+    my ($self) = @_;
+
+    for my $exon ($self->get_exons()) {
+        return (1) if ref ($exon->get_CDS_obj());
+    }
+
+    # every exon was examined without finding a CDS reference
+    return (0);
+}
+
+
+
+sub set_CDS_phases {
+    my ($self, $genomic_seq_ref) = @_;
+    # Assigns {phase} to every CDS segment of this gene and, recursively, of its additional isoforms.
+    # Also recomputes the 5'/3' partial flags from the translated protein.  Returns nothing.
+    my $start_pos = 1;
+    if ($self->has_CDS() && ! $self->is_pseudogene()) { # genes without CDS and pseudogenes keep their current phases and flags
+        # presumably (re)builds the CDS and protein sequences read just below -- TODO confirm in create_all_sequence_types()
+        $self->create_all_sequence_types($genomic_seq_ref);
+        
+        my $cds_sequence = $self->get_CDS_sequence();
+        my $protein_seq = $self->get_protein_sequence();
+        
+        ## first, clear the partial attributes:
+        $self->set_5prime_partial(0);
+        $self->set_3prime_partial(0);
+        
+        # partiality is decided purely from the protein string: leading M / trailing '*'
+        if ($protein_seq !~ /^M/) {
+            # lacks start codon
+            $self->set_5prime_partial(1);
+        }
+        if ($protein_seq !~ /\*$/) {
+            # lacks stop codon
+            $self->set_3prime_partial(1);
+        }
+        
+        $start_pos = $self->_get_cds_start_pos($cds_sequence); # 1-based position where translation starts; dies if > 3
+        
+        # first_phase = number of bases that precede the first codon in the first CDS segment
+        my $first_phase = $start_pos - 1;
+        
+        my @exons = $self->get_exons();
+        my @cds_objs;
+        foreach my $exon (@exons) {
+            my $cds = $exon->get_CDS_obj();
+            if (ref $cds) {
+                push (@cds_objs, $cds);
+            }
+        }
+        # $cds_length accumulates the bases seen so far, not counting the $first_phase leading bases
+        my $cds_obj = shift @cds_objs;
+        $cds_obj->{phase} = $first_phase;
+        my $cds_length = abs ($cds_obj->{end3} - $cds_obj->{end5}) + 1;
+        $cds_length -= $first_phase;
+        # each later segment's phase = (bases accumulated before it) modulo 3
+        while (@cds_objs) {
+            my $next_cds_obj = shift @cds_objs;
+            $next_cds_obj->{phase} = $cds_length % 3;
+            $cds_length += abs ($next_cds_obj->{end3} - $next_cds_obj->{end5}) + 1;
+        }
+    }
+    # isoforms are processed even when this gene itself was skipped above
+    foreach my $isoform ($self->get_additional_isoforms()) {
+        $isoform->set_CDS_phases($genomic_seq_ref);
+    }
+    
+    return;
+
+}
+
+
+## Returns the CDS object of the first exon (in get_exons() order) that has one,
+## or undef when no exon carries a CDS.
+sub get_first_CDS_segment {
+    my ($gene_obj) = @_;
+
+    for my $exon ($gene_obj->get_exons()) {
+        my $cds = $exon->get_CDS_exon_obj();
+        return ($cds) if $cds;
+    }
+
+    return undef;
+}
+
+## Determines the 1-based position (1..3) within $cds_sequence at which translation starts.
+##
+## A Longest_orf finder is run on the forward strand with partial ORFs allowed.  The start
+## of the longest ORF is used unless one of the first three reported ORFs starts within
+## the first three bases, spans all but at most two bases of the CDS and ends in a stop
+## codon ('*'); the first such ORF then takes precedence.
+##
+## Dies when no ORF is found or its start is undefined; confesses when the chosen start
+## lies beyond position 3.
+sub _get_cds_start_pos {
+    my ($self, $cds_sequence) = @_;
+    my $cds_length = length($cds_sequence);
+
+    ## must determine where translation starts:
+    my $new_orfFinder = Longest_orf->new();
+    $new_orfFinder->allow_partials();
+    $new_orfFinder->forward_strand_only();
+
+    my $longest_orf = $new_orfFinder->get_longest_orf($cds_sequence);
+
+    unless (ref $longest_orf) {
+        die "No longest ORF found in sequence";
+    }
+
+    ## examine the first three ORFs, prefer long orf with stop codon.
+    my $orfPos = $longest_orf->{start}; #init to first, longest orf.
+
+    unless (defined $orfPos) {
+        die "Error, orfPos not defined! " . Dumper ($longest_orf);
+    }
+
+    my $bestOrfPos;
+    my @allOrfs = $new_orfFinder->orfs();
+
+    for my $orfIndex (0..2) {
+        my $orf = $allOrfs[$orfIndex];
+        if ($orf) {
+            my $start = $orf->{start};
+            my $length = $orf->{length};
+            my $protein = $orf->{protein};
+            if ($length > $cds_length - 3 && $start <= 3 && $protein =~ /\*$/) {
+                # keep only the first qualifying ORF
+                unless ($bestOrfPos) {
+                    $bestOrfPos = $start;
+                }
+            }
+        }
+    }
+
+    if ($bestOrfPos && $bestOrfPos != $orfPos) {
+        $orfPos = $bestOrfPos;
+    }
+
+    if ($orfPos > 3) {
+        confess "Error, longest ORF is found at position $orfPos, and should be between 1 and 3.  What's wrong with your gene?" . $self->toString();
+    }
+
+    return ($orfPos);
+}
+
+
+=over 4
+        
+=item dispose()
+
+B<Description:> Sets all attributes = 0, hopefully to facilitate targeting for garbage collection. (experimental method) 
+
+B<Parameters:> none
+
+B<Returns:> none
+
+=back
+
+=cut
+
+## Overwrites the value of every attribute of this object with 0; the keys are kept.
+sub dispose {
+    my ($self) = @_;
+    $self->{$_} = 0 for keys %$self;
+}
+
+
+
+## Destructor: only emits a trace warning, and only when $main::DEBUG is true.
+sub DESTROY {
+    my ($self) = @_;
+
+    if ($main::DEBUG) {
+        warn "DESTROYING gene_obj: " . $self->{TU_feat_name} . "," . $self->{Model_feat_name} . "\n";
+    }
+
+}
+
+
+## Checks that the CDS sequence begins with ATG (compared case-insensitively).
+## The CDS sequence must already have been built; confesses when it is unavailable.
+## Dies when the first codon is anything else -- wrap the call in eval to catch it.
+sub validate_start_codon {
+    my ($self) = @_;
+
+    my $cds_sequence = $self->get_CDS_sequence()
+        or confess "Error, cannot get CDS sequence.  It must be built prior to calling this method";
+
+    ## currently, only trust Met start codons.
+    my $first_codon = uc substr($cds_sequence, 0, 3);
+    unless ($first_codon eq "ATG") {
+        die $self->toString() . "Error, start codon is not M (codon $first_codon instead)!";
+    }
+}
+
+
+## Checks that the last three bases of the CDS sequence form one of the codons
+## returned by Nuc_translator::get_stop_codons().
+## The comparison is case-insensitive, matching validate_start_codon(), so a
+## lower-case CDS sequence is not rejected merely because of its case.
+## The CDS sequence must already have been built; confesses when it is unavailable.
+## Dies when the terminal codon is not an accepted stop codon.
+sub validate_stop_codon {
+    ## requires that you have the CDS sequence already set
+    my $self = shift;
+
+    my $cds_sequence = $self->get_CDS_sequence() or confess "Error, cannot get CDS sequence.  It must be built prior to calling this method";
+
+    my @stop_codons = &Nuc_translator::get_stop_codons();
+
+    my $curr_stop_codon = uc substr($cds_sequence, length($cds_sequence)-3, 3);
+
+    my $found_stop_codon_flag = grep { uc($_) eq $curr_stop_codon } @stop_codons;
+
+    unless ($found_stop_codon_flag) {
+        die $self->toString() . "Error, stop codon $curr_stop_codon is not an acceptable stop codon: [@stop_codons]\n";
+    }
+
+}
+
+
+
+
+######################################################################################################################################
+######################################################################################################################################
+
+
+=head1 NAME
+
+package mRNA_exon_obj
+
+=cut
+
+=head1 DESCRIPTION
+
+    The mRNA_exon_obj represents an individual spliced mRNA exon of a gene.  The coordinates of the exon can be manipulated, and the mRNA_exon_obj can contain a single CDS_exon_obj.  A mRNA_exon_obj lacking a CDS_exon_obj component is an untranslated (UTR) exon.
+
+    A mature Gene_obj is expected to have at least one mRNA_exon_obj component.
+
+=cut
+
+
+package mRNA_exon_obj;
+
+use strict;
+use warnings;
+use Storable qw (store retrieve freeze thaw dclone);
+
+=over 4
+
+=item new()
+
+B<Description:> Instantiates an mRNA_exon_obj
+
+B<Parameters:> <(end5, end3)>
+
+The end5 and end3 coordinates can be optionally passed into the constructor to set these attributes.  Alternatively, the set_coords() method can be used to set these values.
+
+B<Returns:> $mRNA_exon_obj
+
+=back
+
+=cut
+
+
+    ;
+
+## Constructor.  Optional arguments: (end5, end3); both must be defined to be stored.
+## The object is blessed into the invocant's class (or the class of an invoking
+## instance) so that subclasses get instances of their own class; this package is
+## used as a fallback when no usable invocant is supplied.
+sub new {
+    my $invocant = shift;
+    my $class = ref($invocant) || $invocant || __PACKAGE__;
+
+    my $self = { end5 => 0,   # stores end5 of mRNA exon
+                 end3 => 0,   # stores end3 of mRNA exon
+                 CDS_exon_obj => 0,   # stores object reference to CDS_obj
+                 feat_name => 0,    # stores TIGR temp id
+                 strand => undef,   #   +|-
+                 };
+
+    # end5 and end3 can be included as parameters in constructor.
+    if (@_) {
+        my ($end5, $end3) = @_;
+        if (defined($end5) && defined($end3)) {
+            $self->{end5} = $end5;
+            $self->{end3} = $end3;
+        }
+    }
+
+    return bless ($self, $class);
+}
+
+
+
+=over 4
+
+=item get_CDS_obj()
+
+B<Description:> Retrieves the CDS_exon_obj component of this mRNA_exon_obj
+
+B<Parameters:> none
+
+B<Returns:> $cds_exon_obj
+
+If no CDS_exon_obj is attached, returns 0
+
+=back
+
+=cut
+
+    ;
+
+# Returns whatever is stored in the CDS_exon_obj slot of this exon.
+sub get_CDS_obj {
+    my ($self) = @_;
+    my $cds_exon_obj = $self->{CDS_exon_obj};
+    return ($cds_exon_obj);
+}
+
+
+## alias
+# Alias for get_CDS_obj(): delegates to it and passes its result through.
+sub get_CDS_exon_obj {
+    my ($self) = @_;
+    return $self->get_CDS_obj();
+}
+
+
+=over 4
+
+=item get_mRNA_exon_end5_end3()
+
+B<Description:> Retrieves the end5, end3 coordinates of the exon
+
+**Method Deprecated**, use get_coords()
+
+B<Parameters:> none
+
+B<Returns:> (end5, end3)
+
+=back
+
+=cut
+
+
+# Returns the stored (end5, end3) pair of this exon.
+sub get_mRNA_exon_end5_end3 {
+    my ($self) = @_;
+    my $end5 = $self->{end5};
+    my $end3 = $self->{end3};
+    return ($end5, $end3);
+}
+
+
+
+=over 4
+
+=item set_CDS_exon_obj()
+
+B<Description:> Sets the CDS_exon_obj of the mRNA_exon_obj
+
+B<Parameters:> $cds_exon_obj
+
+B<Returns:> none
+
+=back
+
+=cut
+
+    ;
+# Attaches $cds_exon_obj to this exon; a non-reference argument is ignored.
+sub set_CDS_exon_obj {
+    my ($self, $cds_exon_obj) = @_;
+    $self->{CDS_exon_obj} = $cds_exon_obj if ref($cds_exon_obj);
+}
+
+
+
+####
+# Detaches the CDS from this exon.
+# The slot is reset to 0, the same "no CDS attached" value that new() initializes it
+# with and that get_CDS_obj() is documented to return, rather than to undef.
+sub delete_CDS_exon_obj {
+    my $self = shift;
+    $self->{CDS_exon_obj} = 0;
+    return;
+}
+
+
+=over 4
+
+=item add_CDS_exon_obj()
+
+B<Description:> Instantiates and adds a new CDS_exon_obj to the mRNA_exon_obj given the CDS coordinates.
+
+B<Parameters:> (end5, end3)
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+# Creates a CDS_exon_obj from (end5, end3) and attaches it via set_CDS_exon_obj().
+sub add_CDS_exon_obj {
+    my ($self, $end5, $end3) = @_;
+    $self->set_CDS_exon_obj(CDS_exon_obj->new ($end5, $end3));
+}
+
+
+=over 4
+
+=item set_feat_name()
+
+B<Description:> Sets the feat_name attribute of the mRNA_exon_obj
+
+B<Parameters:> $feat_name
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+
+# Stores $feat_name in the feat_name attribute of this exon.
+sub set_feat_name {
+    my ($self, $feat_name) = @_;
+    $self->{feat_name} = $feat_name;
+}
+
+
+=over 4
+
+=item clone_exon()
+
+B<Description:> Creates a deep clone of this mRNA_exon_obj, using dclone() of Storable.pm
+
+B<Parameters:> none
+
+B<Returns:> $mRNA_exon_obj
+
+=back
+
+=cut
+    
+    
+
+# Returns a deep copy of this exon made with Storable::dclone().
+sub clone_exon {
+    my ($self) = @_;
+    return (dclone($self));
+}
+
+
+
+=over 4
+
+=item get_CDS_end5_end3 ()
+
+B<Description:> Retrieves end5, end3 of the CDS_exon_obj component of this mRNA_exon_obj
+
+B<Parameters:> none
+
+B<Returns:> (end5, end3)
+
+An empty array is returned if no CDS_exon_obj is attached.
+
+=back
+
+=cut
+
+
+# Returns the (end5, end3) of the attached CDS object, or an empty list when
+# get_CDS_obj() yields a false value.
+sub get_CDS_end5_end3 {
+    my ($self) = @_;
+    my $cds_obj = $self->get_CDS_obj();
+    return ( () ) unless $cds_obj;
+    return ($cds_obj->get_CDS_end5_end3());
+}
+
+
+=over 4
+
+=item get_coords()
+
+B<Description:> Retrieves the end5, end3 coordinates of this mRNA_exon_obj
+
+B<Parameters:> none
+
+B<Returns:> (end5, end3)
+
+=back
+
+=cut
+
+
+# Returns (end5, end3) by delegating to get_mRNA_exon_end5_end3().
+sub get_coords {
+    my ($self) = @_;
+    return $self->get_mRNA_exon_end5_end3();
+}
+
+
+=over 4
+
+=item set_coords()
+
+B<Description:> Sets the end5, end3 coordinates of the mRNA_exon_obj
+
+B<Parameters:> (end5, end3)
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+## simpler coord setting (end5, end3)
+# Stores the given end5 and end3 coordinates on this exon.
+sub set_coords {
+    my ($self, $end5, $end3) = @_;
+    $self->{end5} = $end5;
+    $self->{end3} = $end3;
+}
+
+
+=over 4
+
+=item get_strand()
+
+B<Description:> Retrieves the orientation of the mRNA_exon_obj based on gene models transcribed orientation.
+
+B<Parameters:> none
+
+B<Returns:> +|-|undef
+
+The value returned is the stored strand attribute; it is not inferred from the coordinates, so undef is returned when no strand has been set.
+
+=back
+
+=cut
+
+
+    ;
+
+# Returns the stored strand attribute of this exon as-is.
+sub get_orientation {
+    my ($self) = @_;
+    my $strand = $self->{strand};
+    return ($strand);
+}
+
+
+# Preferred name for get_orientation(); delegates to it.
+sub get_strand {
+    my ($self) = @_;
+    return $self->get_orientation();
+}
+
+
+####
+# Merges $other_exon into this exon.
+#   - CDS: when the other exon has a CDS it is merged into this exon's CDS via
+#     merge_CDS(), or adopted as this exon's CDS when this exon has none.
+#   - Coordinates: this exon is set to span the smallest to the largest of the four
+#     coordinates, oriented by get_orientation() ('+' => end5 is the smaller value,
+#     anything else => end5 is the larger value).
+# Returns nothing.
+sub merge_exon {
+    my ($self, $other_exon) = @_;
+
+    my $own_cds = $self->get_CDS_exon_obj();
+    my $incoming_cds = $other_exon->get_CDS_exon_obj();
+
+    if ($incoming_cds && $own_cds) {
+        $own_cds->merge_CDS($incoming_cds);
+    }
+    elsif ($incoming_cds) {
+        # current exon lacks cds. Set this one to it.
+        $self->set_CDS_exon_obj($incoming_cds);
+    }
+
+    ## merge the exons.
+    my @sorted_coords = sort {$a<=>$b} ($self->get_coords(), $other_exon->get_coords());
+    my ($lend, $rend) = @sorted_coords[0, -1];
+
+    if ($self->get_orientation() eq '+') {
+        $self->set_coords($lend, $rend);
+    }
+    else {
+        $self->set_coords($rend, $lend);
+    }
+
+    return;
+}
+    
+
+
+
+
+
+=over 4
+
+=item toString()
+
+B<Description:> Provides a textual description of the mRNA_exon_obj 
+
+B<Parameters:> none
+
+B<Returns:> $text
+
+=back
+
+=cut
+
+    ;
+
+
+# Returns a one-line description: an optional "feat_name: ...\t" prefix (only when
+# feat_name is true) followed by the end5/end3 coordinates and a newline.
+sub toString {
+    my ($self) = @_;
+    my ($end5, $end3) = $self->get_mRNA_exon_end5_end3();
+    my $feat_name = $self->{feat_name};
+    my $prefix = $feat_name ? "feat_name: $feat_name\t" : "";
+    return ($prefix . "end5 " . $end5 . "\tend3 " . $end3 . "\n");
+}
+
+
+# Returns the number of positions spanned by this exon: |end5 - end3| + 1.
+sub length {
+    my ($self) = @_;
+    return (abs ($self->{end5} - $self->{end3}) + 1);
+}
+
+
+
+
+
+##########################################################################################################################
+##########################################################################################################################
+
+
+
+=head1 NAME
+
+package CDS_exon_obj
+
+=cut
+
+
+=head1 DESCRIPTION
+
+    The CDS_exon_obj represents the protein-coding portion of an mRNA_exon_obj.
+
+=cut
+
+
+
+package CDS_exon_obj;
+
+use strict;
+use warnings;
+use Storable qw (store retrieve freeze thaw dclone);
+use Carp;
+
+
+=over 4
+
+=item new()
+
+B<Description:>  Constructor for the CDS_exon_obj
+
+B<Parameters:> <(end5, end3)>
+
+The (end5, end3) parameter is optional.  Alternatively, the set_coords() method can be used to set these values.
+
+B<Returns:> $cds_exon_obj
+
+=back
+
+=cut
+
+    ;
+
+sub new {
+    # Accept a class name or an existing object as invocant, so subclasses
+    # are blessed into their own package (one-arg bless could not do that).
+    my $class = shift;
+    $class = ref($class) || $class || __PACKAGE__;
+    my $self = { end5 => 0,   #stores end5 of cds exon
+                 end3 => 0,    #stores end3 of cds exon
+                 phase => undef, #must set if to output in gff3 format.
+                 feat_name => 0, #tigr's temp id
+                 strand => undef,   # +|-
+             };
+    # end5 and end3 are allowed constructor parameters
+    if (@_) {
+        my ($end5, $end3) = @_;
+        if (defined ($end5) && defined ($end3)) {
+            $self->{end5} = $end5;
+            $self->{end3} = $end3;
+        }
+    }
+    return bless ($self, $class);
+}
+
+
+
+=over 4
+
+=item set_feat_name()
+
+B<Description:> Sets the feat_name attribute value of the CDS_exon_obj 
+
+B<Parameters:> $feat_name
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+sub set_feat_name {
+    # Store the caller-supplied identifier on the object.
+    my ($self, $feat_name) = @_;
+    # Implicit return of the assignment (the new value) is kept as before.
+    $self->{feat_name} = $feat_name;
+}
+
+
+=over 4
+
+=item get_CDS_end5_end3()
+
+B<Description:> Retrieves the end5, end3 coordinates of the CDS_exon_obj
+
+** Method deprecated **, use get_coords()
+
+
+B<Parameters:> none
+
+B<Returns:> (end5, end3)
+
+=back
+
+=cut
+
+
+sub get_CDS_end5_end3 {
+    my ($self) = @_;    # deprecated accessor; get_coords() delegates to it
+    return @{$self}{qw(end5 end3)};
+}
+
+
+
+=over 4
+
+=item set_coords()
+    
+B<Description:> Sets the (end5, end3) values of the CDS_exon_obj 
+
+B<Parameters:> (end5, end3)
+
+B<Returns:> none
+
+=back
+
+=cut
+
+
+
+sub set_coords {
+    # Overwrite both coordinates; no validation or reordering is applied.
+    my ($self, $end5, $end3) = @_;
+    $self->{end5} = $end5;
+    # Assigned last, so the sub's implicit return value is still end3.
+    $self->{end3} = $end3;
+}
+
+=over 4
+
+=item get_coords()
+
+B<Description:> Retrieves the (end5, end3) coordinates of the CDS_exon_obj
+
+B<Parameters:> none
+
+B<Returns:> (end5, end3)
+
+
+The get_coords() method behaves similarly among Gene_obj, mRNA_exon_obj, and CDS_exon_obj, and is generally preferred to other existing methods for extracting these coordinate values.  Other methods persist for backwards compatibility with older applications, but have been largely deprecated.
+
+
+=back
+
+=cut
+
+
+
+sub get_coords {
+    my ($self) = @_;    # preferred accessor; thin wrapper over get_CDS_end5_end3()
+    return $self->get_CDS_end5_end3;
+}
+
+
+=over 4
+
+=item get_orientation()
+
+B<Description:> Retrieves the orientation of the CDS_exon_obj based on gene models orientation.
+
+B<Parameters:> none
+
+B<Returns:> +|-|undef
+
+undef is returned if the strand attribute has not been set
+
+=back
+
+=cut
+
+    ;
+
+sub get_orientation {
+    # Hands back the stored strand attribute as is; nothing is computed.
+    my ($self) = @_;
+    my $strand = $self->{strand};
+    return $strand;
+}
+
+
+sub get_strand { ## preferred
+	my ($self) = @_;
+	return $self->get_orientation;
+}
+
+
+=over 4
+
+=item toString()
+
+B<Description:> Retrieves a textual description of the CDS_exon_obj
+
+B<Parameters:> none
+
+B<Returns:> $text
+
+=back
+
+=cut
+
+
+
+=over 4
+
+=item clone_cds()
+
+B<Description:> Creates a deep clone of this CDS_exon_obj, using dclone() of Storable.pm
+
+B<Parameters:> none
+
+B<Returns:> $cds_exon_obj
+
+=back
+
+=cut
+    
+    
+
+sub clone_cds {
+    my ($self) = @_;
+    # Storable::dclone makes a deep copy: nested data is duplicated rather
+    # than shared, and the copy keeps the class of the original.
+    my $copy = dclone($self);
+    return ($copy);
+}
+
+
+=over 4
+
+=item length()
+
+B<Description:> length of this cds segment
+
+B<Parameters:> none
+
+B<Returns:> int
+
+=back
+
+=cut
+    
+
+sub length {
+    my ($self) = @_;
+    # Inclusive length: a segment with end5 == end3 counts as 1.
+    my $span = abs ($self->{end3} - $self->{end5});
+    return ($span + 1);
+}
+
+
+
+=over 4
+
+=item  set_phase()
+
+B<Description:> set phase of the CDS incident bp
+
+B<Parameters:> [012]
+
+B<Returns:> self
+
+
+phase 0 = first bp of codon
+phase 1 = second bp of codon
+phase 2 = third bp of codon
+
+
+=back
+
+=cut
+    
+
+sub set_phase {
+    # Stores the phase as given (no range check); returns $self for chaining.
+    my ($self, $phase) = @_;
+    $self->{phase} = $phase;
+    return $self;
+}
+
+=over 4
+
+=item  get_phase()
+
+B<Description:> gets phase of the CDS incident bp
+
+B<Parameters:> none
+
+B<Returns:> [012] or undef if not set
+
+
+phase 0 = first bp of codon
+phase 1 = second bp of codon
+phase 2 = third bp of codon
+
+
+=back
+
+=cut
+    
+
+
+
+sub get_phase {
+    # Returns the stored phase; this is undef for as long as no phase has
+    # been stored on the object.
+    return $_[0]->{phase};
+}
+
+
+
+####
+sub merge_CDS {
+    my $self = shift;
+    my $other_cds = shift;
+    # Grows this CDS in place so that it spans both itself and $other_cds.
+    my $orientation = $self->get_orientation();
+    unless ($orientation) {
+        confess "Error, self CDS lacks orientation\n";
+    }
+    # Pool all four coordinates; the smallest and largest become the new bounds.
+    my @coords = sort {$a<=>$b} ($self->get_coords(), $other_cds->get_coords());
+    my $lend = shift @coords;
+    my $rend = pop @coords;
+    # A false bound (such as the constructor default of 0) is treated as "coordinates not available".
+    unless ($lend && $rend) {
+        confess "Error, trying to merge CDSs but coordinates are not available: \n"
+            . "self: " . $self->toString()
+            . "\n"
+            . "other: " . $other_cds->toString() . "\n";
+    }
+    # For '+' end5 is the left bound; for any other orientation value the two ends are swapped.
+    my ($end5, $end3) = ($orientation eq '+') ? ($lend, $rend) : ($rend, $lend);
+    
+    $self->set_coords($end5, $end3);
+}
+
+sub toString {
+    my ($self) = @_;
+    # Optional "feat_name: <name>\t" prefix, present only for a true feat_name.
+    my $label = $self->{feat_name};
+    my $prefix = $label ? "feat_name: $label\t" : "";
+    # Only the first two values returned by the accessor are used.
+    my ($end5, $end3) = $self->get_CDS_end5_end3();
+    # Single tab-separated, newline-terminated record.
+    my $text = $prefix . "end5 " . $end5 . "\tend3 " . $end3 . "\n";
+    return ($text);
+}
+
+
+1;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/PerlLib/Gene_obj_indexer.pm b/PerlLib/Gene_obj_indexer.pm
new file mode 100755
index 0000000..c44f78b
--- /dev/null
+++ b/PerlLib/Gene_obj_indexer.pm
@@ -0,0 +1,75 @@
+#!/usr/local/bin/perl
+
+package Gene_obj_indexer;
+use strict;
+use warnings;
+use base qw(TiedHash);
+use Gene_obj;
+use Storable qw (thaw nfreeze);
+use Carp;
+
+
+####
+sub new {
+    # Pure pass-through constructor: the invocant and every remaining
+    # argument are forwarded to the parent class constructor (TiedHash,
+    # via "use base"), and its result is returned unchanged.
+    my ($packagename, @args) = @_;
+    my $indexer = $packagename->SUPER::new(@args);
+    return ($indexer);
+}
+
+####
+sub store_gene {
+    # Serialise $gene_obj with Storable::nfreeze, store the blob under
+    # $identifier, then read it back through get_gene() to verify it.
+    # Confesses if $gene_obj is not a reference, or if the stored gene still
+    # cannot be read back after $max_attempts tries.  The original retried
+    # without limit, so a persistent failure made this method spin forever.
+    my ($self, $identifier, $gene_obj)  = @_;
+
+    unless (ref $gene_obj) {
+        confess "Error, no gene_obj as param";
+    }
+
+    my $blob = nfreeze ($gene_obj);
+    my $max_attempts = 10;
+
+    foreach my $attempt (1 .. $max_attempts) {
+        $self->store_key_value($identifier, $blob);
+
+        # get_gene() confesses when the blob is missing or does not thaw;
+        # the trailing 1 makes eval return true only on success.
+        my $verified = eval { $self->get_gene($identifier); 1 };
+        return if ($verified);
+
+        warn "error trying to store gene $identifier using berkeley db.  Trying again...\n";
+    }
+
+    # Every attempt failed: report it, including the last read-back error.
+    confess "Error, could not store gene $identifier after $max_attempts attempts: $@";
+}
+
+
+####
+sub get_gene {
+    # Fetch the frozen blob stored under $identifier and thaw it back into
+    # an object.  Confesses when nothing (or a false value) is stored there,
+    # or when thaw() does not yield a reference.
+    my ($self, $identifier) = @_;
+
+    my $frozen = $self->get_value($identifier);
+    confess "Error, no gene obj retrieved based on identifier $identifier"
+        unless ($frozen);
+
+    my $gene_obj = thaw($frozen);
+    if (! ref $gene_obj) {
+        confess "Error retrieving gene_obj based on identifier $identifier.  Data retrieved but not thawed properly.\n";
+    }
+
+    return ($gene_obj);
+}
+
+
+1; #EOM
+
diff --git a/PerlLib/Longest_orf.pm b/PerlLib/Longest_orf.pm
new file mode 100755
index 0000000..61737c5
--- /dev/null
+++ b/PerlLib/Longest_orf.pm
@@ -0,0 +1,371 @@
+#!/usr/local/bin/perl
+
+package main;
+our $SEE; 
+
+package Longest_orf;
+
+use strict;
+use warnings;
+use Nuc_translator;
+use Carp;
+
+## if allow_partials is set, partial orfs are included in the analysis.  
+
+# below used to be static, now instance vars.
+#my $ALLOW_5PRIME_PARTIALS = 0; #allow for lacking start codon in longest orf.
+#my $ALLOW_3PRIME_PARTIALS = 0; #allow for lacking stop codon in longest orf.
+#my $FORWARD_STRAND = 1; #default set to true (analyze forward strand)
+#my $REVERSE_STRAND = 1; #default set to true.
+#my $ALLOW_NON_MET_STARTS = 0; #allow for non-methionine start codons.
+
+
+sub new {
+    my $class = shift;
+    # Invocant may be a class name or an object; subclasses keep their own package.
+    ## This object stores the longest ORF identified.
+    my @stop_codons = &Nuc_translator::get_stop_codons(); # live call, depends on current genetic code.
+    print "Stop codons in use: @stop_codons, set dynamically via current Nuc_translator settings.\n" if $SEE;
+    unless (@stop_codons) {
+	  confess "Fatal, no stop codons set";
+	}
+
+	my $obj = { pep_seq => undef,
+				nt_seq => undef,
+				length => undef, #length of nt_seq
+				end5 => undef,
+				end3 => undef,
+				all_ORFS=>[], #container holds all ORFs found in order of decreasing length. Use orfs() method to retrieve them.
+				stop_codons => [@stop_codons],
+				
+				## ORF settings
+				ALLOW_5PRIME_PARTIALS => 0,
+				ALLOW_3PRIME_PARTIALS => 0,
+				FORWARD_STRAND => 1,
+				REVERSE_STRAND => 1,
+				ALLOW_NON_MET_STARTS => 0
+					
+				};
+    bless ($obj, ref($class) || $class || __PACKAGE__);
+    return ($obj);
+}
+
+## can include partial orfs at end of sequence.
+sub allow_partials {
+    # Switch on both partial-ORF options (5' and 3').  Dies when not
+    # called on an object.
+    my ($self) = @_;
+    ref $self or die;
+    $self->{ALLOW_5PRIME_PARTIALS} = 1;
+    $self->{ALLOW_3PRIME_PARTIALS} = 1;
+    # $SEE is the package-main flag that gates these diagnostic prints.
+    print "Longest_orf: allowing both 5' and 3' partials.\n" if $SEE;
+}
+
+sub allow_5prime_partials {
+    # Switch on the 5' partial option only.  Dies when not called on an
+    # object.
+    my ($self) = @_;
+    ref $self or die;
+    $self->{ALLOW_5PRIME_PARTIALS} = 1;
+    print "Longest_orf: allowing 5prime partials.\n" if $SEE;
+}
+
+sub allow_3prime_partials {
+    # Switch on the 3' partial option only.  Dies when not called on an
+    # object.
+    my ($self) = @_;
+    ref $self or die;
+    $self->{ALLOW_3PRIME_PARTIALS} = 1;
+    print "Longest_orf: allowing 3prime partials\n" if $SEE;
+}
+
+sub forward_strand_only {
+    # Switch off the reverse-strand pass; the forward-strand setting is
+    # left as it is.  Dies when not called on an object.
+    my ($self) = @_;
+    ref $self or die;
+    $self->{REVERSE_STRAND} = 0;
+
+    print "Longest_orf: forward strand only.\n" if $SEE;
+}
+
+sub reverse_strand_only {
+    # Switch off the forward-strand pass; the reverse-strand setting is
+    # left as it is.  Dies when not called on an object.
+    my ($self) = @_;
+    ref $self or die;
+    $self->{FORWARD_STRAND} = 0;
+
+    print "Longest_orf: reverse strand only.\n" if $SEE;
+}
+
+sub allow_non_met_starts {
+    # Switch on non-Met start codons.  Unlike the other option setters,
+    # this one does not check that the invocant is an object.
+    my ($self) = @_;
+    $self->{ALLOW_NON_MET_STARTS} = 1;
+    print "Longest_orf: allowing non Met start codons.\n" if $SEE;
+}
+
+
+sub get_longest_orf {
+    # Returns the longest ORF found in $input_sequence (one element of the
+    # list produced by capture_all_ORFs), or undef when no ORF is found.
+    # Missing or too-short input is reported on STDERR and gives an empty
+    # return.
+    my ($self, $input_sequence) = @_;
+
+    if (! $input_sequence) {
+		print STDERR "I require a cDNA nucleotide sequence as my only parameter\n";
+		return;
+    }
+    if (length ($input_sequence) < 3) {
+		print STDERR "Sequence must code for at least a codon. Your seq_length is too short\n";
+		return;
+    }
+
+    # capture_all_ORFs() returns its ORFs in order of decreasing length, so
+    # the first element, if there is one, is the longest; with an empty
+    # list $longest stays undef.
+    my ($longest) = $self->capture_all_ORFs($input_sequence);
+    return ($longest);
+}
+
+
+sub capture_all_ORFs {
+    # Finds all ORFs on the enabled strand(s) and returns them, longest first, as a list of hash refs.
+    my $self = shift;
+    my $input_sequence = shift;
+    # Missing input, or input shorter than 3 nt, is reported on STDERR and gives an empty return.
+    unless ($input_sequence) {
+		print STDERR "I require a cDNA nucleotide sequence as my only parameter\n";
+		return;
+    }
+    unless (length ($input_sequence) >= 3) {
+		print STDERR "Sequence must code for at least a codon. Your seq_length is too short\n";
+		return;
+    }
+    # The start/stop searches below look for lower-case codons, hence the lc().
+    $input_sequence = lc ($input_sequence);
+    
+    my (@starts, @stops, @orfs);
+
+    if ($self->{FORWARD_STRAND}) {
+		## analyse forward position
+		@stops = $self->identify_putative_stops($input_sequence);
+		@starts = $self->identify_putative_starts($input_sequence,\@stops);
+		@orfs = $self->get_orfs (\@starts, \@stops, $input_sequence, '+');
+    }
+    
+    if ($self->{REVERSE_STRAND}) {
+		## reverse complement sequence and do again
+		$input_sequence = &revcomp ($input_sequence);
+		@stops = $self->identify_putative_stops($input_sequence);
+		@starts = $self->identify_putative_starts($input_sequence, \@stops);
+		push (@orfs,  $self->get_orfs (\@starts, \@stops, $input_sequence, '-'));
+    }
+	# NOTE(review): when no ORF is found, the fields on $self are left untouched and may still describe an earlier sequence.
+    if (@orfs) {
+		## set in order of decreasing length
+		@orfs = reverse sort {$a->{length} <=> $b->{length}} @orfs;
+		# The longest ORF is also recorded on the object for get_end5_end3() and the sequence getters.
+		my $longest_orf = $orfs[0];
+		my $start = $longest_orf->{start};
+		my $stop = $longest_orf->{stop};
+		my $seq = $longest_orf->{sequence};
+		my $length = length($seq);
+		my $protein = &translate_sequence($seq, 1);
+		$self->{end5} = $start;  ## now coord is seq_based instead of array based.
+		$self->{end3} = $stop;
+		$self->{length} = $length;
+		$self->{nt_seq} = $seq;
+		$self->{pep_seq} = $protein;
+		$self->{all_ORFS} = \@orfs;
+	}
+
+	return (@orfs);
+}
+
+sub orfs {
+    my ($self) = @_;    # all_ORFS is filled by capture_all_ORFs() when it finds ORFs
+    return @{ $self->{all_ORFS} };
+}
+
+#####################
+# supporting methods
+#####################
+
+sub get_end5_end3 {
+    my ($self) = @_;    # coordinates of the longest ORF recorded on the object
+    return @{$self}{qw(end5 end3)};
+}
+
+sub get_peptide_sequence {
+    my ($self) = @_;    # translation of the longest ORF recorded on the object
+    return $self->{pep_seq};
+}
+
+sub get_nucleotide_sequence {
+    my ($self) = @_;    # nucleotides of the longest ORF recorded on the object
+    return $self->{nt_seq};
+}
+
+
+sub toString {
+    # Three-line text summary of the longest ORF recorded on this object.
+    my ($self) = @_;
+    my ($end5, $end3) = $self->get_end5_end3();
+    my @lines = ("Coords: $end5, $end3",
+                 "Protein: " . $self->get_peptide_sequence(),
+                 "Nucleotides: " . $self->get_nucleotide_sequence());
+    # Every line, including the last, ends with a newline.
+    return (join ("\n", @lines) . "\n");
+}
+
+
+#################################
+
+#Private methods:
+
+
+sub get_orfs {
+    my ($self, $starts_ref, $stops_ref, $seq, $direction) = @_;
+    # Pairs each usable start offset with the first in-frame stop after it; returns a list of ORF hash refs.
+	unless ($starts_ref && $stops_ref && $seq && $direction) {
+		confess "Error, params not appropriate";
+	}
+	# $starts_ref/$stops_ref: array refs of 0-based offsets into $seq; $direction: '+' or '-'.
+	my %last_delete_pos = ( 0=>-1,
+							1=>-1,
+							2=>-1); #store position of last chosen stop codon in spec reading frame.
+    my @orfs;
+    my $seq_length = length ($seq);
+	
+    if ($SEE) {
+		print "Potential Start codons: " . join (", ", @$starts_ref) . "\n";
+		print "Potential Stop codons: " . join (", ", @$stops_ref) . "\n";
+    }
+    # The caller passes both lists sorted ascending, so the first stop that passes the test is the nearest one.
+    # A start that is not beyond the stop already chosen for its frame lies inside that ORF and is skipped.
+    foreach my $start_pos (@{$starts_ref}) {
+		my $start_pos_frame = $start_pos % 3;
+		foreach my $stop_pos (@{$stops_ref}) {
+		  # print "Comparing start: $start_pos to stop: $stop_pos, $direction\n";
+		  if ( ($stop_pos > $start_pos)   && #end3 > end5
+				 ( ($stop_pos - $start_pos) % 3 == 0) #must be in-frame
+				 && ($start_pos > $last_delete_pos{$start_pos_frame})) #only count each stop once.
+			{
+				
+				$last_delete_pos{$start_pos_frame} = $stop_pos;
+				my ($start_pos_adj, $stop_pos_adj) = ( ($start_pos+1), ($stop_pos+1+2));
+				#print "Startposadj: $start_pos_adj\tStopPosadj: $stop_pos_adj\n";
+				# sequence based position rather than array-based
+				# (+1 makes the offset 1-based; the extra +2 moves the stop to the last base of the stop codon)
+				my ($start, $stop) = ($direction eq '+') ? ($start_pos_adj, $stop_pos_adj) 
+					: (&revcomp_coord($start_pos_adj, $seq_length), &revcomp_coord($stop_pos_adj, $seq_length));
+				# The ORF is cut from $seq exactly as passed in, whatever $direction is.
+				print "Retrieving ORF, Start: $start\tStop: $stop\n" if $SEE;
+				my $orfSeq =  substr ($seq, $start_pos, ($stop_pos - $start_pos + 3)); #include the stop codon too.
+				my $protein = &translate_sequence($orfSeq, 1);
+				if ($protein =~ /\*.*\*/) {
+				  confess "Fatal Error: Longest_orf: ORF returned which contains intervening stop(s): ($start-$stop, $direction\nProtein:\n$protein\nOf Nucleotide Seq:\n$seq\n";
+				}
+				my $orf = { sequence => $orfSeq,
+							protein => $protein,
+							start=>$start,
+							stop=>$stop,
+							length=>length($orfSeq),
+							orient=>$direction
+							};
+				push (@orfs, $orf);
+				last;
+			}
+		}
+    }
+    return (@orfs);
+}
+
+
+sub identify_putative_starts {
+    my ($self, $seq, $stops_aref) = @_; # returns the candidate start offsets (0-based), sorted and unique
+    my %starts;
+    my %stops;
+    foreach my $stop (@$stops_aref) {
+		$stops{$stop} = 1;
+    }
+    # With 5' partials or non-Met starts allowed, offsets 0, 1 and 2 count as starts unless a stop sits there.
+    if ($self->{ALLOW_5PRIME_PARTIALS} || $self->{ALLOW_NON_MET_STARTS}) {
+		$starts{0} = 1 unless $stops{0};
+		$starts{1} = 1 unless $stops{1};
+		$starts{2} = 1 unless $stops{2};
+    }
+    # Met-only mode collects every "atg" offset (overlapping hits included); otherwise the codon after each stop.
+    if (! $self->{ALLOW_NON_MET_STARTS}) { #Look for ATG start codons.
+		my $start_pos = index ($seq, "atg");
+		while ($start_pos != -1) {
+			$starts{$start_pos} = 1;
+			#print "Start: $start_pos\n";
+			$start_pos = index ($seq, "atg", ($start_pos + 1));
+		}
+    } else {
+		# find all residues just subsequent to a stop codon, in-frame:
+		foreach my $stop (@$stops_aref) {
+			my $candidate_non_met_start = $stop +3;
+			unless ($stops{$candidate_non_met_start}) {
+				$starts{$candidate_non_met_start} = 1;
+			}
+		}
+    }
+    my @starts = sort {$a<=>$b} keys %starts;
+    return (@starts);
+}
+
+
+sub identify_putative_stops {
+  # Collects the 0-based offset of every stop-codon occurrence in $seq,
+  # in any frame and including overlapping hits, and returns the offsets
+  # as a numerically sorted list without duplicates.
+  my ($self, $seq) = @_;
+  my %stops;
+  if ($self->{ALLOW_3PRIME_PARTIALS}) {
+	## count terminal 3 nts as possible ORF terminators.
+	# i.e. offsets length-2, length-1 and length are added as well.
+	my $seq_length = length ($seq);
+	$stops{$_} = 1 for ($seq_length - 2 .. $seq_length);
+  }
+  # Codons are lower-cased before searching; the stored list is not changed.
+  foreach my $stop_codon (map { lc($_) } @{$self->{stop_codons}}) {
+	print "Searching for stop codon: ($stop_codon).\n" if $SEE;
+	my $hit = index ($seq, $stop_codon);
+	while ($hit != -1) {
+	  $stops{$hit} = 1;
+	  # resume one base further on, so overlapping occurrences are found too
+	  $hit = index ($seq, $stop_codon, $hit + 1);
+	}
+  }
+  my @stops = sort {$a<=>$b} keys %stops;
+  return (@stops);
+}
+
+
+sub revcomp {
+    # Reverse-complement $seq; either case, IUPAC pairs Y/R, K/M, B/V, D/H swapped, other characters unchanged.
+    my ($seq) = @_;
+    (my $reversed_seq = reverse ($seq)) =~ tr/ACGTacgtyrkmYRKMbvdhBVDH/TGCAtgcarymkRYMKvbhdVBHD/;
+    return ($reversed_seq);
+}
+
+
+sub revcomp_coord {
+    my ($coord, $seq_length) = @_;    # mirrors a 1-based coordinate: 1 <-> $seq_length
+    return (1 + $seq_length - $coord);
+}
+
+
+
+
+   
+1;
+
+
diff --git a/PerlLib/Nuc_translator.pm b/PerlLib/Nuc_translator.pm
new file mode 100755
index 0000000..f8c7e4c
--- /dev/null
+++ b/PerlLib/Nuc_translator.pm
@@ -0,0 +1,415 @@
+#!/usr/bin/env perl
+
+package main;
+our $SEE;
+
+package Nuc_translator;
+
+use strict;
+require Exporter;
+use Carp;
+
+
+our @ISA = qw (Exporter);
+our @EXPORT = qw (translate_sequence get_protein reverse_complement);
+
+use vars qw ($currentCode %codon_table $init_codon_table_subref);
+
+
+=head1 NAME
+
+package Nuc_translator.pm
+
+
+=head1 SYNOPSIS
+
+Nuc_translator::use_specified_genetic_code ("universal"); 
+
+my $nuc_sequence = "atgaaagggccctga";
+
+my $translation_frame = 1;
+
+my $protein = &translate_sequence($nuc_sequence, $translation_frame);
+
+
+=head1 DESCRIPTION
+
+Methods are provided to translate nucleotide sequences into protein sequences using a specified genetic code.
+
+Available genetic codes include universal, Euplotes, Tetrahymena, Candida, Acetabularia
+
+For info on these codes, visit:
+
+http://golgi.harvard.edu/biolinks/gencode.html
+
+Methods exported by this package include:
+
+translate_sequence() 
+
+get_protein() 
+
+reverse_complement()
+
+To change the translation code, the following fully qualified method must be used:
+
+Nuc_translator::use_specified_genetic_code()
+
+
+=head1 Methods
+
+
+=cut
+
+
+
+## Code names that use_specified_genetic_code() accepts; see http://golgi.harvard.edu/biolinks/gencode.html
+my %SUPPORTED_GENETIC_CODES = ( universal => 1,
+                                Euplotes => 1,
+                                Tetrahymena => 1,
+                                Candida => 1,
+                                Acetabularia => 1,
+                                'Mitochondrial-Canonical' => 1,
+                                'Mitochondrial-Vertebrates' => 1,
+                                'Mitochondrial-Arthropods' => 1,
+                                'Mitochondrial-Echinoderms' => 1,
+                                'Mitochondrial-Molluscs' => 1,
+                                'Mitochondrial-Ascidians' => 1,
+                                'Mitochondrial-Nematodes' => 1,
+                                'Mitochondrial-Platyhelminths' => 1,
+                                'Mitochondrial-Yeasts' => 1,
+                                'Mitochondrial-Euascomycetes' => 1,
+                                'Mitochondrial-Protozoans' => 1,
+                                );
+
+
+
+=over 4
+
+=item translate_sequence()
+
+B<Description:> translates a nucleotide sequence given a specific frame 1-6.
+
+B<Parameters:> $nuc_sequence, $frame
+
+B<Returns:> $protein_sequence
+
+=back
+
+=cut
+
+
+
+sub translate_sequence {
+    # Translate $sequence in reading frame $frame using the current
+    # %codon_table.  Frames 1-3 read the given strand; frames 4-6 read the
+    # reverse complement in frames 1-3 respectively.
+    # Returns the protein string: '*' marks stop codons, 'X' marks complete
+    # codons absent from the table, and a trailing partial codon is dropped.
+    # Confesses if $frame is not between 1 and 6.
+    my ($sequence, $frame) = @_;
+
+    unless ($frame >= 1 and $frame <= 6) {
+        confess "Frame $frame is not allowed. Only between 1 and 6";
+    }
+
+    # Work in upper-case DNA letters first: reverse_complement() maps T
+    # but has no entry for U.
+    $sequence = uc ($sequence);
+    $sequence =~ tr/U/T/;
+    my $seq_length = length ($sequence);
+
+    if ($frame > 3) {
+        # on reverse strand. Revcomp the sequence and reset the frame
+        $sequence = &reverse_complement($sequence);
+        $frame -= 3;
+    }
+
+    # The codon table is keyed by RNA codons (U instead of T).
+    $sequence =~ tr/T/U/;
+
+    # Start from "" so that input too short to hold a codon gives an empty
+    # string instead of undef (the accumulator used to be uninitialised).
+    my $protein_sequence = "";
+    for (my $i = $frame - 1; $i < $seq_length; $i += 3) {
+        my $codon = substr($sequence, $i, 3);
+        if (exists($codon_table{$codon})) {
+            $protein_sequence .= $codon_table{$codon};
+        }
+        elsif (length($codon) == 3) {
+            $protein_sequence .= 'X';
+        }
+        # else: incomplete codon at the end of the sequence, adds nothing
+    }
+    return($protein_sequence);
+}
+
+
+
+=over 4
+
+=item get_protein()
+
+B<Description:> translates nucleotide sequence into a protein sequence.  All 3 forward translation frames are tried
+and the first reading frame found to translate without stop codons is returned.  If all 3 frames provide stop codons, the protein with the least number of stops is returned.
+
+B<Parameters:> $nucleotide_sequence
+
+B<Returns:> $protein_sequence
+
+=back
+
+=cut
+
+
+
+sub get_protein {
+    # Translate $sequence in the three forward frames and pick the best:
+    # the first frame whose translation has no internal stop is returned
+    # immediately; failing that, the translation with the fewest internal
+    # stops (the earliest frame on ties) is returned.
+    my ($sequence) = @_;
+
+    my $fewest_stops;          # undef until the first frame has been scored
+    my $best_protein = "";
+
+    foreach my $forward_frame (1 .. 3) {
+        my $protein = &translate_sequence($sequence, $forward_frame);
+        my $num_stops = &count_stops_in_prot_seq($protein);
+
+        # A translation free of internal stops wins outright.
+        if ($num_stops == 0) {
+            return ($protein);
+        }
+
+        # Keep the current best unless this frame has strictly fewer stops.
+        next if (defined ($fewest_stops) && $num_stops >= $fewest_stops);
+        $fewest_stops = $num_stops;
+        $best_protein = $protein;
+    }
+    return ($best_protein);
+}
+
+
+=over 4
+
+=item reverse_complement()
+
+B<Description:> reverse complements a nucleotide sequence
+
+B<Parameters:> $nucleotide_sequence
+
+B<Returns:> $nucleotide_sequence_rev_comped
+
+=back
+
+=cut
+
+
+
+sub reverse_complement {
+    # Returns the reverse complement of $s with case preserved.  IUPAC pairs
+    # Y/R and K/M are swapped, and now also B/V and D/H (formerly untouched).
+    my ($s) = @_;
+    (my $rc = reverse ($s)) =~ tr/ACGTacgtyrkmYRKMbvdhBVDH/TGCAtgcarymkRYMKvbhdVBHD/;
+    return($rc);
+}
+
+
+####
+sub count_stops_in_prot_seq {
+    # Counts the '*' characters in $prot_seq, leaving out its final
+    # character.  The caller's string is not modified (a copy is chopped).
+    my ($prot_seq) = @_;
+    chop $prot_seq; #remove trailing stop.
+    # tr with an empty replacement list only counts; it changes nothing.
+    my $stop_num = ($prot_seq =~ tr/*//);
+    return ($stop_num);
+}
+
+
+
+####
+sub use_specified_genetic_code {
+    # Switch %codon_table to the named genetic code.  The table is first
+    # reset to the universal code and the code-specific codons are then
+    # overridden.  Dies if $special_code is not in %SUPPORTED_GENETIC_CODES.
+    # "universal" itself is now accepted; it used to fall through to the
+    # final confess although it is listed as supported.
+    my ($special_code) = @_;
+    print STDERR "using special genetic code $special_code\n" if $SEE;
+    unless ($SUPPORTED_GENETIC_CODES{$special_code}) {
+        die "Sorry, $special_code is not currently supported or recognized.\n";
+    }
+    &$init_codon_table_subref(); ## Restore default universal code.  Others are variations on this.
+    $currentCode = $special_code;
+    if ($special_code eq "universal") {
+        # nothing to override: the freshly reset table is the universal code
+    }
+    elsif ($special_code eq "Euplotes") {
+        $codon_table{UGA} = "C";
+    }
+    elsif ($special_code eq "Tetrahymena" || $special_code eq "Acetabularia") {
+        $codon_table{UAA} = "Q";
+        $codon_table{UAG} = "Q";
+    }
+    elsif ($special_code eq "Candida") {
+        $codon_table{CUG} = "S";
+    }
+    elsif ($special_code =~ /Mitochondrial/) {
+        &_set_mitochondrial_code($special_code);
+    }
+    else {
+        ## shouldn't ever get here anyway.
+        confess "Error, code $special_code is not recognized.\n";
+    }
+}
+
+
+####
+sub _set_mitochondrial_code {
+    # Apply the codon overrides for mitochondrial code $code on top of the
+    # current %codon_table.  Only the Vertebrates and Arthropods variants
+    # are implemented; any other name confesses.
+    my $code = shift;
+    ## set canonical by default:
+    ## (each of these re-asserts the value the universal table already
+    ## holds; presumably a baseline for the variant-specific changes)
+    $codon_table{AUA} = "I";
+    $codon_table{AAA} = "K";
+    $codon_table{AGA} = $codon_table{AGG} = "R";
+    # Fixed: this line used to assign "L" to CAU/CAG/CAC/CAA, which made
+    # every His and Gln codon translate as Leu.  The Leu family is CUN.
+    $codon_table{CUU} = $codon_table{CUG} = $codon_table{CUC} = $codon_table{CUA} = "L";
+
+    if ($code eq "Mitochondrial-Vertebrates") {
+        $codon_table{UGA} = "W";
+        $codon_table{AUA} = "M";
+        $codon_table{AGA} = "*";
+        $codon_table{AGG} = "*";
+    }
+    elsif ($code eq "Mitochondrial-Arthropods") {
+        $codon_table{UGA} = "W";
+        $codon_table{AUA} = "M";
+        $codon_table{AGA} = "S"; # NOTE(review): AGG stays "R" here - confirm against the intended table
+    }
+    else {
+        confess "Sorry, $code hasn't been fully implemented yet.";
+    }
+    return;
+}
+
+
+
+####
+sub get_stop_codons {
+    # Lists the stop codons ('*' entries) of the code currently held in
+    # %codon_table, written as DNA (T in place of U).  The order is that of
+    # hash iteration, i.e. not fixed.
+    my @stop_codons;
+    for my $rna_codon (grep { $codon_table{$_} eq '*' } keys %codon_table) {
+        # Convert a copy so nothing but the returned list is affected.
+        (my $dna_codon = $rna_codon) =~ tr/U/T/;
+        push (@stop_codons, $dna_codon);
+    }
+    return (@stop_codons);
+}
+
+
+BEGIN {
+  $init_codon_table_subref = sub { # re-callable initialiser; use_specified_genetic_code() invokes it to reset the table
+	print STDERR "initing codon table.\n" if $SEE;
+    ## Set to Universal Genetic Code
+    $currentCode = "universal";
+    # Keys are RNA codons (U, not T); '*' marks a stop codon.
+    %codon_table = (    UUU => 'F',
+                        UUC => 'F',
+                        UUA => 'L',
+                        UUG => 'L',
+                        
+                        CUU => 'L',
+                        CUC => 'L',
+                        CUA => 'L',
+                        CUG => 'L',
+                        
+                        AUU => 'I',
+                        AUC => 'I',
+                        AUA => 'I',
+                        AUG => 'M',
+                        
+                        GUU => 'V',
+                        GUC => 'V',
+                        GUA => 'V',
+                        GUG => 'V',
+                        
+                        UCU => 'S',
+                        UCC => 'S',
+                        UCA => 'S',
+                        UCG => 'S',
+                        
+                        CCU => 'P',
+                        CCC => 'P',
+                        CCA => 'P',
+                        CCG => 'P',
+                        
+                        ACU => 'T',
+                        ACC => 'T',
+                        ACA => 'T',
+                        ACG => 'T',
+                        
+                        GCU => 'A',
+                        GCC => 'A',
+                        GCA => 'A',
+                        GCG => 'A',
+                        
+                        UAU => 'Y',
+                        UAC => 'Y',
+                        UAA => '*',
+                        UAG => '*',
+                        
+                        CAU => 'H',
+                        CAC => 'H',
+                        CAA => 'Q',
+                        CAG => 'Q',
+                        
+                        AAU => 'N',
+                        AAC => 'N',
+                        AAA => 'K',
+                        AAG => 'K',
+                        
+                        GAU => 'D',
+                        GAC => 'D',
+                        GAA => 'E',
+                        GAG => 'E',
+                        
+                        UGU => 'C',
+                        UGC => 'C',
+                        UGA => '*',
+                        UGG => 'W',
+                        
+                        CGU => 'R',
+                        CGC => 'R',
+                        CGA => 'R',
+                        CGG => 'R',
+                        
+                        AGU => 'S',
+                        AGC => 'S',
+                        AGA => 'R',
+                        AGG => 'R',
+                        
+                        GGU => 'G',
+                        GGC => 'G',
+                        GGA => 'G',
+                        GGG => 'G'    
+                        
+                        );
+  };
+  # Run it once here, inside BEGIN, so the table is filled in at compile time.
+  &$init_codon_table_subref();
+}
+
+
+1; #end of module
+
+
+
+
diff --git a/PerlLib/TiedHash.pm b/PerlLib/TiedHash.pm
new file mode 100755
index 0000000..70fd9cb
--- /dev/null
+++ b/PerlLib/TiedHash.pm
@@ -0,0 +1,199 @@
+#!/usr/local/bin/perl
+
+package TiedHash;
+use strict;
+use warnings;
+use DB_File;
+use Carp;
+
+=example
+
+	my $tied_hash = new TiedHash( { create => "$pfam_db.inx" } );
+
+
+    my $acc = "";
+
+    while (<$fh>) {
+	   chomp;
+       my ($token, $rest) = split (/\s+/, $_, 2);
+       if ($token eq 'NAME') {
+	      $acc = $rest;
+       }
+       elsif ($token =~ /^(NC|TC|DESC|ACC)$/) {
+	   my $key = "$acc$;$token";
+       $tied_hash->store_key_value($key, $rest);
+       print STDERR "storing: $key, $rest\n";
+   }
+
+
+=cut
+
+
+sub new {
+    # Constructor.  $prefs_href is optional; when given it must be a
+    # reference, and if it is a hash ref it may name an index file:
+    #   { create => $file }  - handled by create_index_file()
+    #   { use    => $file }  - handled by use_index_file()
+    # "create" is tried first; "use" is only consulted when "create" is
+    # absent or false.
+    my ($packagename, $prefs_href) = @_;
+
+    if ($prefs_href && ! ref $prefs_href) {
+        confess "Error, need hash reference with opts in constructor.\n";
+    }
+
+    my %fields = (
+        index_filename => undef,
+        tied_index => {},
+        tie_invoked => 0,
+    );
+    my $self = bless (\%fields, $packagename);
+
+    if (ref $prefs_href eq "HASH") {
+        if (my $create_file = $prefs_href->{"create"}) {
+            $self->create_index_file($create_file);
+        }
+        elsif (my $use_file = $prefs_href->{"use"}) {
+            $self->use_index_file($use_file);
+        }
+    }
+
+    return ($self);
+}
+
+####
+sub tie_invoked {
+    my ($self) = @_;    # flag starts at 0 and is set to 1 by the methods that tie the index
+    return $self->{tie_invoked};
+}
+
+
+####
+sub DESTROY {
+    # Destructor: release the tie on the index hash.  index_filename is
+    # only set by the methods that tie the hash, so a true value means
+    # there is something to untie; otherwise nothing is done.
+    my ($self) = @_;
+    return unless ($self->{index_filename});
+    untie (%{$self->{tied_index}});
+}
+
+
+####
+sub create_index_file {
+    my ($self, @args) = @_;    # alias: all the work is done by make_index_file()
+    return $self->make_index_file(@args);
+}
+
+
+
+####
+sub make_index_file {
+    # Create a fresh DB_File B-tree index at $filename (an existing file
+    # is removed first) and tie it read/write to this object's hash.
+    # Confesses if no filename is given, the old file cannot be removed, or
+    # the tie fails.  A failed tie used to go unnoticed, so later stores
+    # landed in an ordinary in-memory hash and were never written to disk.
+    my ($self, $filename) = @_;
+    unless ($filename) {
+        confess "need filename as parameter";
+    }
+    if (-e $filename) {
+        unlink $filename or confess "cannot remove existing index filename $filename";
+    }
+
+    tie (%{$self->{tied_index}}, 'DB_File', $filename, O_CREAT|O_RDWR, 0666, $DB_BTREE)
+        or confess "Error, cannot create index file $filename: $!";
+    $self->{index_filename} = $filename;
+    $self->{tie_invoked} = 1;
+    return;
+}
+
+
+####
+sub use_index_file {
+    # Open the existing index at $filename read-only and tie it to this
+    # object's hash.  Confesses if no filename is given, the file is
+    # missing or empty, or the tie fails (a failed tie used to go
+    # unnoticed, leaving an empty plain hash behind every lookup).
+    my ($self, $filename) = @_;
+
+    unless ($filename) {
+        confess "need filename as parameter";
+    }
+
+    unless (-s $filename) {
+        confess "Error, cannot locate file: $filename\n";
+    }
+
+    tie (%{$self->{tied_index}}, 'DB_File', $filename, O_RDONLY, 0, $DB_BTREE)
+        or confess "Error, cannot open index file $filename: $!";
+
+    $self->{index_filename} = $filename;
+    $self->{tie_invoked} = 1;
+    #my @keys = $self->get_keys();
+    #unless (@keys) {
+    #    confess "Error, tried using $filename db, but couldn't perform retrievals.\n";
+    #}
+    return;
+}
+
+
+####
+sub store_key_value {
+    # Store $value under $identifier in the tied index and read it back
+    # to confirm that the write took effect.
+    # Confesses if no index has been tied, or if the value still cannot be
+    # read back after $max_attempts tries.  The original retried without
+    # limit, so a write that can never succeed looped forever.
+    my ($self, $identifier, $value)  = @_;
+
+    unless ($self->tie_invoked()) {
+        confess "Error, cannot store key/value pair since tied hash not created.\n";
+    }
+
+    my $max_attempts = 10;
+
+    foreach my $attempt (1 .. $max_attempts) {
+        $self->{tied_index}->{$identifier} = $value;
+
+        # Success is judged as before: the value read back is defined.
+        return if (defined ($self->get_value($identifier)));
+
+        warn "Berkeley DB had trouble storing ($identifier); trying again.\n";
+    }
+
+    # Every attempt failed: stop instead of spinning.
+    confess "Error, Berkeley DB could not store ($identifier) after $max_attempts attempts";
+}
+
+
+####
+sub get_value {
+    # Look up $identifier in the tied index and return whatever is stored
+    # there (undef when nothing is).  Confesses if no index has been tied.
+    my ($self, $identifier) = @_;
+
+    unless ($self->tie_invoked()) {
+        confess "Error, cannot retrieve value from untied hash\n";
+    }
+
+    # Go through a local alias of the index hash reference.
+    my $index = $self->{tied_index};
+    return ($index->{$identifier});
+}
+
+
+## 
+sub get_keys {
+    # Returns every key of the tied index (in scalar context: how many
+    # there are).  Confesses if no index has been tied.
+    my ($self) = @_;
+
+    $self->tie_invoked()
+        or confess "Error, cannot retrieve values from untied hash\n";
+    return (keys %{$self->{tied_index}});
+}
+
+
+1; #EOM
diff --git a/README b/README
new file mode 100644
index 0000000..70f725e
--- /dev/null
+++ b/README
@@ -0,0 +1,27 @@
+TransDecoder
+
+TransDecoder identifies candidate coding regions within transcript sequences, such as those generated by de novo RNA-Seq transcript assembly using Trinity, or constructed based on RNA-Seq alignments to the genome using Tophat and Cufflinks.
+
+It uses the following criteria:
+
+*    a minimum length open reading frame (ORF) is found in a transcript sequence
+*    a log-likelihood score similar to what is computed by the GeneID software is > 0.
+*    the above coding score is greatest when the ORF is scored in the 1st reading frame as compared to scores in the other 5 reading frames.
+*    if a candidate ORF is found fully encapsulated by the coordinates of another candidate ORF, the longer one is reported. However, a single transcript can report multiple ORFs (allowing for operons, chimeras, etc).
+*    optionally, the putative peptide has a match to a Pfam domain above the noise cutoff score.
+
+The software is primarily maintained by Brian Haas at the Broad Institute and Alexie Papanicolaou at the Commonwealth Scientific and Industrial Research Organisation (CSIRO). It is integrated into other related software such as Trinity, PASA, EVidenceModeler, and Trinotate.
+
+Full documentation is provided at: http://transdecoder.github.io
+
+===========================
+To build just TransDecoder:
+
+Please make sure you read http://transdecoder.github.io before proceeding.
+
+   %  make
+
+
+See sample_data/ directory for example usage.
+
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f2ef89f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,26 @@
+# TransDecoder
+
+TransDecoder identifies candidate coding regions within transcript sequences, such as those generated by de novo RNA-Seq transcript assembly using Trinity, or constructed based on RNA-Seq alignments to the genome using Tophat and Cufflinks.
+
+It uses the following criteria:
+
+*    a minimum length open reading frame (ORF) is found in a transcript sequence
+*    a log-likelihood score similar to what is computed by the GeneID software is > 0.
+*    the above coding score is greatest when the ORF is scored in the 1st reading frame as compared to scores in the other 5 reading frames.
+*    if a candidate ORF is found fully encapsulated by the coordinates of another candidate ORF, the longer one is reported. However, a single transcript can report multiple ORFs (allowing for operons, chimeras, etc).
+*    optionally, the putative peptide has a match to a Pfam domain above the noise cutoff score.
+
+The software is primarily maintained by Brian Haas at the Broad Institute and Alexie Papanicolaou at the Commonwealth Scientific and Industrial Research Organisation (CSIRO). It is integrated into other related software such as Trinity, PASA, EVidenceModeler, and Trinotate.
+
+Full documentation is provided at: http://transdecoder.github.io
+
+===========================
+To build just TransDecoder:
+
+Please make sure you read http://transdecoder.github.io before proceeding.
+
+   %  make
+
+See sample_data/ directory for example usage.
+
+
diff --git a/Release.Notes b/Release.Notes
new file mode 100644
index 0000000..025a2ba
--- /dev/null
+++ b/Release.Notes
@@ -0,0 +1,20 @@
+
+## 2015-01-26   v2.0 release
+
+-overhauled the build
+    -removed the active searching of Pfam and all MPI-related functionality
+    -runs in 2 phases:
+        -TransDecoder.LongOrfs : extracts the long ORFs
+        -TransDecoder.Predict : predicts the likely coding regions among the ORFs
+            -step can use Pfam and blastp search results (blast support is a new addition)
+    -run Pfam and/or BlastP searches directly or try using "HPC GridRunner" (http://HpcGridRunner.github.io)
+
+-moved to github 
+
+
+
+## 2014-07-04
+
+-added 'make simple' to build just the essential components involving parafly and cdhit
+-removed the 'cds.' prefix from the pep and cds sequence accessions.
+
diff --git a/TransDecoder.LongOrfs b/TransDecoder.LongOrfs
new file mode 100755
index 0000000..df68e5b
--- /dev/null
+++ b/TransDecoder.LongOrfs
@@ -0,0 +1,301 @@
+#!/usr/bin/env perl
+
+my $MIN_PROT_LENGTH = 100;
+
+=pod
+
+=head1 NAME
+
+L<Transdecoder.LongOrfs|http://transdecoder.github.io> - Transcriptome Protein Prediction
+
+=head1 USAGE
+
+Required:
+
+ -t <string>                            transcripts.fasta
+
+Optional:
+
+ -m <int>                               minimum protein length (default: 100)
+ 
+ -G <string>                            genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia)
+
+ -S                                     strand-specific (only analyzes top strand)
+
+=head1 Genetic Codes
+
+See L<http://golgi.harvard.edu/biolinks/gencode.html>. These are currently supported:
+
+ universal (default)
+ Euplotes
+ Tetrahymena
+ Candida
+ Acetabularia
+ Mitochondrial-Canonical
+ Mitochondrial-Vertebrates
+ Mitochondrial-Arthropods
+ Mitochondrial-Echinoderms
+ Mitochondrial-Molluscs
+ Mitochondrial-Ascidians
+ Mitochondrial-Nematodes
+ Mitochondrial-Platyhelminths
+ Mitochondrial-Yeasts
+ Mitochondrial-Euascomycetes
+ Mitochondrial-Protozoans
+
+
+=cut
+
+
+use strict;
+use warnings;
+use FindBin;
+use Pod::Usage;
+use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
+use Data::Dumper;
+use List::Util qw (min max);
+use File::Basename;
+
+use lib ("$FindBin::RealBin/PerlLib");
+
+use POSIX qw(ceil);
+use Gene_obj;
+use Nuc_translator;
+use Fasta_reader;
+use Longest_orf;
+
+my $UTIL_DIR = "$FindBin::RealBin/util";
+$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";
+
+
+## Command-line settings.  Lexicals that were declared here but never read
+## ($workdir, $search_pfam, $reuse, $pfam_out, $MPI_DEBUG) have been removed;
+## the main block declares its own $workdir.
+my $transcripts_file;             # -t : transcripts FASTA file (required)
+my $genetic_code = 'universal';   # -G : genetic code name
+my $TOP_STRAND_ONLY = 0;          # -S : restrict the ORF search to the top strand
+my $help;                         # -h : show the full usage text
+my $verbose;                      # -v : copied into $SEE below
+
+# -m overrides $MIN_PROT_LENGTH, which is declared at the top of the script.
+&GetOptions( 't=s' => \$transcripts_file,
+             'm=i' => \$MIN_PROT_LENGTH,
+             'G=s' => \$genetic_code,
+             'h' => \$help,
+             'v' => \$verbose,
+             'S' => \$TOP_STRAND_ONLY,
+             );
+
+pod2usage(-verbose => 2, -output => \*STDERR) if ($help);
+
+# Getopt::Long is configured with pass_through, so unrecognized options are
+# left behind in @ARGV instead of being reported; reject them here.
+if (@ARGV) {
+    die "Error, don't understand options: @ARGV";
+}
+
+our $SEE = $verbose;
+
+# Say why we are bailing out (TransDecoder.Predict prints the same message)
+# instead of dumping the usage text with no hint about what was wrong.
+pod2usage(-verbose => 2,
+          -output => \*STDERR,
+          -message => "No transcript file (-t)\n") unless ($transcripts_file && -s $transcripts_file);
+
+# Only switch translation tables when something other than the default
+# 'universal' code was requested.
+if ($genetic_code ne 'universal') {
+    &Nuc_translator::use_specified_genetic_code($genetic_code);
+}
+
+
+
+main: {
+    ## Extract every ORF whose length (in codons) is at least $MIN_PROT_LENGTH from each
+    ## transcript and write peptide, CDS and GFF3 records under <transcripts>.transdecoder_dir/.
+    my $workdir = basename($transcripts_file) . ".transdecoder_dir";
+    unless (-d $workdir) {
+        mkdir($workdir) or die "Error, cannot mkdir $workdir.";
+    }
+
+    # Base frequencies are computed once; the .ok checkpoint file skips this step on reruns.
+    my $base_freqs_file = "$workdir/base_freqs.dat";
+    my $base_freqs_checkpoint = "$base_freqs_file.ok";
+    if (! -e $base_freqs_checkpoint) {
+        print STDERR "\n\n-first extracting base frequencies, we'll need them later.\n";
+        my $cmd = "$UTIL_DIR/compute_base_probs.pl $transcripts_file $TOP_STRAND_ONLY > $base_freqs_file";
+        &process_cmd($cmd);
+
+        &process_cmd("touch $base_freqs_checkpoint");
+    }
+
+    my $prefix = "$workdir/longest_orfs";
+    my $cds_file = "$prefix.cds";
+    my $gff3_file = "$prefix.gff3";
+    my $pep_file = "$prefix.pep";
+
+    # lexical handles with 3-arg open; a failure names the file that could not be written
+    open (my $pep_fh, '>', $pep_file) or die "Error, cannot write to $pep_file: $!";
+    open (my $cds_fh, '>', $cds_file) or die "Error, cannot write to $cds_file: $!";
+    open (my $gff_fh, '>', $gff3_file) or die "Error, cannot write to $gff3_file: $!";
+
+    print STDERR "\n\n- extracting ORFs from transcripts.\n";
+
+    my $model_counter = 0;
+    my $trans_counter = 0;
+
+    # Count the FASTA header lines in Perl rather than through a `grep | wc` shell
+    # pipeline, so unusual characters in the file name cannot break the count.
+    my $num_total_trans = 0;
+    open (my $count_fh, '<', $transcripts_file) or die "Error, cannot read $transcripts_file: $!";
+    while (my $line = <$count_fh>) { $num_total_trans++ if $line =~ /^>/; }
+    close $count_fh;
+    print STDERR "-total transcripts to examine: $num_total_trans\n";
+
+    my $fasta_reader = Fasta_reader->new($transcripts_file);
+    while (my $seq_obj = $fasta_reader->next()) {
+
+        $trans_counter++;
+        # guard the ratio so a zero header count cannot cause a division by zero
+        my $percent_done = sprintf("%.2f", $num_total_trans ? $trans_counter/$num_total_trans*100 : 0);
+        print STDERR "\r[$trans_counter/$num_total_trans] = $percent_done\% done    " if $trans_counter % 100 == 0;
+
+        my $acc = $seq_obj->get_accession();
+        my $sequence = $seq_obj->get_sequence();
+
+        my $longest_orf_finder = Longest_orf->new();
+        $longest_orf_finder->allow_5prime_partials();
+        $longest_orf_finder->allow_3prime_partials();
+
+        if ($TOP_STRAND_ONLY) {
+            $longest_orf_finder->forward_strand_only();
+        }
+
+        my @orf_structs = $longest_orf_finder->capture_all_ORFs($sequence);
+
+        # longest ORFs first
+        @orf_structs = reverse sort {$a->{length}<=>$b->{length}} @orf_structs;
+
+        foreach my $orf (@orf_structs) {
+
+            my $start = $orf->{start};
+            my $stop = $orf->{stop};
+
+            # length in codons, taken from the coordinates as reported (before the trimming below)
+            my $length = int((abs($start-$stop)+1)/3);
+            my $orient = $orf->{orient};
+            my $protein = $orf->{protein};
+
+            ##################################
+            # adjust for boundary conditions, since starts and stops run off the ends of the sequences at partial codons
+            #################################
+
+            # adjust at 3' end
+            if ($stop > length($sequence)) {
+                $stop -= 3;
+            }
+            if ($start > length($sequence)) {
+                $start -= 3;
+            }
+
+            # adjust at 5' end
+            if ($stop < 1) {
+                $stop += 3;
+            }
+            if ($start < 1) {
+                $start += 3;
+            }
+
+            if ($length < $MIN_PROT_LENGTH) { next; }
+
+            my $cds_coords_href = { $start => $stop };
+            my $exon_coords_href = ($start < $stop) ? { 1 => length($sequence) } : { length($sequence) => 1 };
+
+            my $gene_obj = Gene_obj->new();
+            $gene_obj->populate_gene_object($cds_coords_href, $exon_coords_href);
+            $gene_obj->{asmbl_id} = $acc;
+
+            $model_counter++;
+
+            my $model_id = "$acc|m.$model_counter";
+            my $gene_id = "$acc|g.$model_counter";
+
+            $gene_obj->{TU_feat_name} = $gene_id;
+            $gene_obj->{Model_feat_name} = $model_id;
+
+            my $cds = $gene_obj->create_CDS_sequence(\$sequence);
+
+            unless ($cds) {
+                die "Error, no CDS for gene: " . Dumper($cds_coords_href) . Dumper($exon_coords_href);
+            }
+
+            # completeness: a leading M marks a start codon, a trailing * marks a stop codon
+            my $got_start = 0;
+            my $got_stop = 0;
+            if ($protein =~ /^M/) {
+                $got_start = 1;
+            }
+            if ($protein =~ /\*$/) {
+                $got_stop = 1;
+            }
+
+            my $prot_type = "";
+            if ($got_start && $got_stop) {
+                $prot_type = "complete";
+            } elsif ($got_start) {
+                $prot_type = "3prime_partial";
+            } elsif ($got_stop) {
+                $prot_type = "5prime_partial";
+            } else {
+                $prot_type = "internal";
+            }
+
+            $gene_obj->{com_name} = "ORF $gene_id $model_id type:$prot_type len:$length ($orient)";
+
+            # the PEP and CDS headers carry the same fields, except that the PEP header also records the genetic code (gc:)
+            # we are currently not printing this out at the final data but it would be nice to.
+            my $pep_header = ">$model_id $gene_id type:$prot_type len:$length gc:$genetic_code $acc:$start-$stop($orient)\n";
+            my $cds_header = ">$model_id $gene_id type:$prot_type len:$length $acc:$start-$stop($orient)\n";
+
+            print {$pep_fh} $pep_header."$protein\n";
+
+            print {$cds_fh} $cds_header."$cds\n";
+
+            print {$gff_fh} $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
+
+        }
+    }
+
+    # buffered write errors (e.g. a full disk) only surface at close, so check it
+    close $pep_fh or die "Error, problem closing $pep_file: $!";
+    close $cds_fh or die "Error, problem closing $cds_file: $!";
+    close $gff_fh or die "Error, problem closing $gff3_file: $!";
+
+    print STDERR "\n\n#################################\n"
+                  . "### Done preparing long ORFs.  ###\n"
+                  . "##################################\n\n";
+
+    print STDERR "\tUse file: $pep_file  for Pfam and/or BlastP searches to enable homology-based coding region identification.\n\n";
+
+    print STDERR "\tThen, run TransDecoder.Predict for your final coding region predictions.\n\n\n";
+
+    exit(0);
+
+}
+
+
+####
+sub process_cmd {
+    # Echo a command line to STDOUT, run it through system(), and die when
+    # system() reports anything other than zero.
+    #   $cmd : complete command line, passed to system() as one string.
+    # Returns nothing.
+    my $cmd = shift;
+
+    print "CMD: $cmd\n";
+
+    my $ret = system($cmd);
+    die "Error, cmd: $cmd died with ret $ret" if $ret;
+    return;
+}
+
diff --git a/TransDecoder.Predict b/TransDecoder.Predict
new file mode 100755
index 0000000..912d921
--- /dev/null
+++ b/TransDecoder.Predict
@@ -0,0 +1,364 @@
+#!/usr/bin/env perl
+
+=pod
+
+=head1 NAME
+
+L<Transdecoder|http://transdecoder.github.io> - Transcriptome Protein Prediction
+
+=head1 USAGE
+
+Required:
+
+ -t <string>                            transcripts.fasta
+
+Common options:
+ 
+ --retain_long_orfs <int>               retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence 
+                                         marks it as coding (default: 900 bp => 300aa)
+
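+ --retain_pfam_hits <string>            Pfam search results for the long ORF peptides, in domain-table
+                                        format (hmmscan --domtblout); ORFs reported with a hit are retained
+ 
+ --retain_blastp_hits <string>          blastp results in tab-delimited format (-outfmt 6); query ORFs with a hit are retained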
+
+
+
+Advanced options
+
+ --train <string>                       FASTA file with ORFs to train the Markov model for protein identification; otherwise 
+                                        the longest non-redundant ORFs are used
+
+ -T <int>                               If no --train, top longest ORFs to train Markov Model (hexamer stats) (default: 500)
+
+
+=cut
+
+
+use strict;
+use warnings;
+use FindBin;
+use Pod::Usage;
+use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
+use Data::Dumper;
+use List::Util qw (min max);
+use File::Basename;
+
+use lib ("$FindBin::RealBin/PerlLib");
+
+use POSIX qw(ceil);
+use Gene_obj;
+use Nuc_translator;
+use Fasta_reader;
+use Longest_orf;
+
+my $UTIL_DIR = "$FindBin::RealBin/util";
+$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";
+
+
+# Path to cd-hit-est.  Resolved only after the options have been parsed (see the
+# end of this section) so that -h and usage errors work without it installed.
+my $cd_hit_est_exec;
+
+my ($transcripts_file,$train_file);
+
+my $top_ORFs_train = 500;
+
+my $help;
+my $verbose;
+
+# --search_pfam, --reuse and --debug are still accepted so existing command
+# lines keep working, but nothing in this script reads their values.
+my $search_pfam = "";
+my $reuse;
+my $MPI_DEBUG = 1;
+
+my $RETAIN_LONG_ORFS = 900;
+
+my $retain_pfam_hits_file;
+my $retain_blastp_hits_file;
+
+&GetOptions( 't=s' => \$transcripts_file,
+             'train:s' => \$train_file,
+
+             'h' => \$help,
+             'v' => \$verbose,
+
+             'T=i' => \$top_ORFs_train,
+
+             'search_pfam=s' => \$search_pfam,
+             'reuse' => \$reuse,
+
+             'retain_long_orfs=i' => \$RETAIN_LONG_ORFS,
+
+             'debug' => \$MPI_DEBUG,
+
+             'retain_pfam_hits=s' => \$retain_pfam_hits_file,
+             'retain_blastp_hits=s' => \$retain_blastp_hits_file,
+             );
+
+pod2usage(-verbose => 2, -output => \*STDERR) if ($help);
+
+if (@ARGV) {
+    die "Error, don't understand options: @ARGV";
+}
+
+our $SEE = $verbose;
+
+pod2usage(-verbose => 2, -output => \*STDERR, -message => "No transcript file (-t)\n") unless ($transcripts_file && -s $transcripts_file);
+
+# cd-hit-est is only used to pick non-redundant training ORFs, i.e. when no --train file was given.
+($cd_hit_est_exec) = &check_program('cd-hit-est') unless ($train_file);
+
+main: {
+    my $workdir = basename($transcripts_file) . ".transdecoder_dir";
+    # Score the ORFs found by TransDecoder.LongOrfs (its work directory must already exist) and write the final <transcripts>.transdecoder.* files.
+    unless (-d $workdir) {
+        die "Error, cannot find directory: $workdir,  be sure to first run TransDecoder.LongOrfs before TransDecoder.Predict\n\n";
+    }
+
+    my $prefix = "$workdir/longest_orfs";
+    my $cds_file = "$prefix.cds";
+    my $gff3_file = "$prefix.gff3";
+
+    ## Train a Markov model based on user-provided file or longest candidate CDS sequences, score all candidates, and select the final set.
+    ## The training-set, hexamer and scoring steps each leave an <output>.ok checkpoint file and are skipped when it already exists.
+
+    my $top_cds_file;
+    if ($train_file) {
+
+        if (! -s $train_file) {
+            die "Error, cannot locate train file: $train_file";
+        }
+        $top_cds_file = $train_file;
+    }
+    else {
+        $top_cds_file = "$cds_file.top_${top_ORFs_train}_longest";
+        my $checkpoint = "$top_cds_file.ok";
+        if (! -e $checkpoint) {
+
+            # to speed things up only check for redundancy up to x the number of entries we want
+            my $red_num = $top_ORFs_train * 10;
+            my $red_num_cds_longest_file = "$cds_file.top_longest_${red_num}";
+            &process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num > $red_num_cds_longest_file");
+            &process_cmd("$cd_hit_est_exec -r 1 -i $red_num_cds_longest_file -c 0.80 -o $red_num_cds_longest_file.nr80 -M 0 ");
+            &process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr80 $top_ORFs_train > $top_cds_file");
+
+            &process_cmd("touch $checkpoint");
+
+        }
+    }
+
+
+    # get hexamer scores
+    my $hexamer_scores_file = "$workdir/hexamer.scores";
+    my $hexamer_checkpoint = "$hexamer_scores_file.ok";
+    if (! -e $hexamer_checkpoint) {
+
+        my $base_freqs_file = "$workdir/base_freqs.dat";
+
+        my $cmd = "$UTIL_DIR/seq_n_baseprobs_to_logliklihood_vals.pl $top_cds_file $base_freqs_file > $hexamer_scores_file";
+        &process_cmd($cmd);
+
+        &process_cmd("touch $hexamer_checkpoint");
+    }
+
+    # score all cds entries
+    my $cds_scores_file = "$cds_file.scores";
+    my $cds_scores_checkpoint = "$cds_scores_file.ok";
+    if (! -e $cds_scores_checkpoint) {
+        my $cmd = "$UTIL_DIR/score_CDS_liklihood_all_6_frames.pl $cds_file $hexamer_scores_file > $cds_scores_file";
+        &process_cmd($cmd);
+
+        &process_cmd("touch $cds_scores_checkpoint");
+    }
+
+    ## Retain those that have pfam and/or blastp matches
+
+    my %has_pfam_hit;
+
+    if ($retain_pfam_hits_file) {
+        %has_pfam_hit = &parse_pfam_hits($retain_pfam_hits_file);
+    }
+
+    my %has_blastp_hit;
+    if ($retain_blastp_hits_file) {
+        %has_blastp_hit = &parse_blastp_hits_file($retain_blastp_hits_file);
+    }
+
+    # get accs for best entries
+    my $acc_file = "$cds_file.scores.selected";
+    {
+        open (my $ofh, '>', $acc_file) or die "Error, cannot write to $acc_file: $!";
+        open (my $ifh, '<', $cds_scores_file) or die "Error, cannot open file $cds_scores_file: $!";
+        while (my $line = <$ifh>) {
+            chomp $line;
+            my ($acc, $orf_length, @scores) = split(/\t/, $line);
+            # first score column = the ORF scored in its own (1st) reading frame; remaining columns = the other frames
+            my $score_1 = shift @scores;
+            my $max_score_other_frame = max(@scores);
+            if ($has_pfam_hit{$acc}
+                ||
+                $has_blastp_hit{$acc}
+                ||
+                $orf_length >= $RETAIN_LONG_ORFS
+                ||
+                ($score_1 > 0 && $score_1 > $max_score_other_frame)
+                ) {
+                print $ofh "$acc\n";
+
+                if ($has_pfam_hit{$acc}) {
+                    print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
+                }
+                if ($has_blastp_hit{$acc}) {
+                    print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
+                }
+
+            }
+        }
+        close $ifh;
+        close $ofh or die "Error, problem closing $acc_file: $!";
+    }
+
+    # index the current gff file:
+    my $cmd = "$UTIL_DIR/index_gff3_files_by_isoform.pl $gff3_file";
+    &process_cmd($cmd);
+
+    # retrieve the best entries:
+    $cmd = "$UTIL_DIR/gene_list_to_gff.pl $acc_file $gff3_file.inx > $cds_file.best_candidates.gff3";
+    &process_cmd($cmd);
+
+
+    ##############################
+    ## Generate the final outputs.
+    ##############################
+
+    my $final_output_prefix = basename($transcripts_file) . ".transdecoder";
+
+    {
+
+        # exclude shadow orfs (smaller orfs in different reading frame that are eclipsed by longer orfs)
+        $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $final_output_prefix.gff3";
+        &process_cmd($cmd);
+
+        ## write final outputs:
+
+        ## make a BED file for viewing in IGV
+        my $gff3_file = "$final_output_prefix.gff3";
+        my $bed_file = $gff3_file;
+        $bed_file =~ s/\.gff3$/\.bed/;
+        $cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
+        &process_cmd($cmd);
+
+
+        # make a peptide file:
+        my $best_pep_file = $gff3_file;
+        $best_pep_file =~ s/\.gff3$/\.pep/;
+        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file > $best_pep_file";
+        &process_cmd($cmd);
+
+
+
+        # make a CDS file:
+        my $best_cds_file = $best_pep_file;
+        $best_cds_file =~ s/\.pep$/\.cds/;
+        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file CDS > $best_cds_file";
+        &process_cmd($cmd);
+
+        # make an mRNA file:
+        my $best_cdna_file = $best_pep_file;
+        $best_cdna_file =~ s/\.pep$/\.mRNA/;
+        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file cDNA > $best_cdna_file";
+        &process_cmd($cmd);
+
+    }
+
+    print STDERR "transdecoder is finished.  See output files $final_output_prefix.\*\n\n\n";
+
+
+
+    exit(0);
+}
+
+
+####
+sub process_cmd {
+    # Run one shell command: announce it on STDOUT, execute it with system(),
+    # and abort the script when system() returns a non-zero value.
+    #   $cmd : the full command line as a single string.
+    # Returns nothing on success.
+    my $cmd = $_[0];
+
+    print "CMD: $cmd\n";
+    if (my $ret = system($cmd)) {
+        die "Error, cmd: $cmd died with ret $ret";
+    }
+    return;
+}
+
+
+
+####
+sub parse_pfam_hits {
+    # Collect the ORF accessions that have at least one Pfam hit.
+    #   $pfam_hits_file : whitespace-delimited hit table (the original note calls it
+    #                     "domtbl"); column 4 is taken as the ORF accession.
+    # Returns a hash, flattened to a list, of accession => 1.  Lines starting
+    # with '#' are skipped.  Dies if the file is missing or cannot be opened.
+    my ($pfam_hits_file) = @_;
+    if (! -e $pfam_hits_file) {
+        die "Error, cannot find pfam hits file: $pfam_hits_file";
+    }
+
+    print "PFAM output found and processing...\n";
+    # 3-arg open: the name is used literally (2-arg open strips surrounding
+    # whitespace and honours mode characters embedded in the name)
+    open (my $fh, '<', $pfam_hits_file) or die "Error, cannot open file: $pfam_hits_file ($!)";
+    my %has_pfam_hit;
+    while (my $ln=<$fh>) {
+        next if $ln=~/^\#/;
+        my @x = split(/\s+/,$ln);
+        $has_pfam_hit{ $x[3] } = 1 if $x[3];  # domtbl: 4th field
+    }
+    close $fh;
+    return(%has_pfam_hit);
+}
+
+####
+sub parse_blastp_hits_file {
+    # Collect the ids found in the first tab-delimited column of a blastp
+    # results file.  Returns a hash, flattened to a list, of id => 1.
+    # Dies if the file is missing or cannot be opened.
+    my ($blastp_file) = @_;
+
+    unless (-e $blastp_file) {
+        die "Error, cannot find file $blastp_file";
+    }
+    my %blastp_hits;
+    open (my $fh, '<', $blastp_file) or die "Error, cannot open file $blastp_file ($!)";
+    while (my $line = <$fh>) {
+        chomp $line;
+        my ($id) = split(/\t/, $line);
+        # blank lines, empty first columns and '#' comment lines are not hits
+        next if (! defined $id || $id eq '' || $id =~ /^#/);
+        $blastp_hits{$id} = 1;
+    }
+    close $fh;
+    return(%blastp_hits);
+}
+
+# Locate each program named in @_ with `which` and return the paths in the same order; dies as
+# soon as one cannot be found.  (The empty prototype was dropped: the sub takes arguments.)
+sub check_program {
+    require File::Spec;
+    my @paths;
+    foreach my $prog (@_) {
+        chomp(my $path = `which $prog`);
+        die "Error, path to a required program ($prog) cannot be found\n\n" unless $path =~ /^\//;
+        # Follow a symlink one level.  readlink() may return a target relative to the
+        # link's own directory, so anchor it there instead of at the current directory.
+        $path = File::Spec->rel2abs(readlink($path), dirname($path)) if -l $path;
+        push( @paths, $path );
+    }
+    return @paths;
+}
diff --git a/notes b/notes
new file mode 100644
index 0000000..7806c64
--- /dev/null
+++ b/notes
@@ -0,0 +1 @@
+git clone https://github.com/TransDecoder/TransDecoder.git
diff --git a/sample_data/blastp.results.outfmt6.gz b/sample_data/blastp.results.outfmt6.gz
new file mode 100644
index 0000000..09e79bf
Binary files /dev/null and b/sample_data/blastp.results.outfmt6.gz differ
diff --git a/sample_data/cleanme.pl b/sample_data/cleanme.pl
new file mode 100755
index 0000000..f5f72ad
--- /dev/null
+++ b/sample_data/cleanme.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+
+
+## we delete all files we don't need in this directory. Be careful in case users try running it somewhere else, outside this dir.
+chdir $FindBin::Bin or die "error, cannot cd to $FindBin::Bin";
+
+## the distributed inputs; every other file at the top level of this directory is removed
+my @files_to_keep = qw (cleanme.pl
+                        runMe.sh
+                        test.genome.fasta.gz
+                        test.tophat.sam.gz
+                        transcripts.gtf.gz
+                        pfam.domtblout.gz
+                        blastp.results.outfmt6.gz
+                        );
+
+my %keep = map { + $_ => 1 } @files_to_keep;
+
+foreach my $file (<*>) {
+
+    next if $keep{$file};
+
+    # unlink() cannot remove directories; leave them out instead of announcing
+    # a removal that silently fails (the work directory is handled below)
+    next if (-d $file && ! -l $file);
+
+    print STDERR "-removing file: $file\n";
+    unlink($file) or warn "warning, could not remove $file: $!\n";
+}
+
+## remove the TransDecoder work directory created for transcripts.fasta
+my $workdir = "./transcripts.fasta.transdecoder_dir/";
+if (-d $workdir) {
+    system("rm", "-rf", $workdir) == 0 or warn "warning, could not remove $workdir\n";
+}
+
+exit(0);
diff --git a/sample_data/pfam.domtblout.gz b/sample_data/pfam.domtblout.gz
new file mode 100644
index 0000000..9b7af55
Binary files /dev/null and b/sample_data/pfam.domtblout.gz differ
diff --git a/sample_data/runMe.sh b/sample_data/runMe.sh
new file mode 100755
index 0000000..fe9ea42
--- /dev/null
+++ b/sample_data/runMe.sh
@@ -0,0 +1,62 @@
+#!/bin/bash -e
+# Runs the sample TransDecoder pipeline; pass any argument to also use the bundled Pfam and BlastP results.
+if [ -e test.genome.fasta.gz ] && [ ! -e test.genome.fasta ]; then
+    gunzip -c test.genome.fasta.gz > test.genome.fasta
+fi
+
+if [ -e test.tophat.sam.gz ] && [ ! -e test.tophat.sam ]; then
+    gunzip -c test.tophat.sam.gz > test.tophat.sam
+fi
+
+if [ -e transcripts.gtf.gz ] && [ ! -e transcripts.gtf ]; then
+    gunzip -c transcripts.gtf.gz > transcripts.gtf
+fi
+
+if [ -e blastp.results.outfmt6.gz ] && [ ! -e blastp.results.outfmt6 ]; then
+    gunzip -c blastp.results.outfmt6.gz > blastp.results.outfmt6
+fi
+
+if [ -e pfam.domtblout.gz ] && [ ! -e pfam.domtblout ]; then
+    gunzip -c pfam.domtblout.gz > pfam.domtblout
+fi
+
+
+
+## generate alignment gff3 formatted output
+../util/cufflinks_gtf_to_alignment_gff3.pl transcripts.gtf > transcripts.gff3
+
+## generate transcripts fasta file
+../util/cufflinks_gtf_genome_to_cdna_fasta.pl transcripts.gtf test.genome.fasta > transcripts.fasta 
+
+## Extract the long ORFs
+../TransDecoder.LongOrfs -t transcripts.fasta
+
+
+## Predict likely ORFs (the argument is quoted so that a value containing spaces cannot break the test)
+if [ -n "$1" ]; then
+    ## use pfam and blast results:
+    ../TransDecoder.Predict  -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.results.outfmt6 -v
+else
+    # just coding metrics
+    ../TransDecoder.Predict -t transcripts.fasta 
+fi
+
+## convert to genome coordinates
+../util/cdna_alignment_orf_to_genome_orf.pl transcripts.fasta.transdecoder.gff3 transcripts.gff3 transcripts.fasta > transcripts.fasta.transdecoder.genome.gff3
+
+
+## make bed files for viewing with GenomeView
+
+# convert cufflinks gtf to bed
+../util/cufflinks_gtf_to_bed.pl transcripts.gtf > transcripts.bed
+
+# convert the genome-based gene-gff3 file to bed
+../util/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed
+
+echo
+echo
+echo Done!  Coding region genome annotations provided as: transcripts.fasta.transdecoder.genome.gff3
+echo
+echo 
+
+exit 0
diff --git a/sample_data/test.genome.fasta.gz b/sample_data/test.genome.fasta.gz
new file mode 100644
index 0000000..7b2f788
Binary files /dev/null and b/sample_data/test.genome.fasta.gz differ
diff --git a/sample_data/test.tophat.sam.gz b/sample_data/test.tophat.sam.gz
new file mode 100644
index 0000000..e2bed47
Binary files /dev/null and b/sample_data/test.tophat.sam.gz differ
diff --git a/sample_data/transcripts.gtf.gz b/sample_data/transcripts.gtf.gz
new file mode 100644
index 0000000..0c7b52b
Binary files /dev/null and b/sample_data/transcripts.gtf.gz differ
diff --git a/transdecoder_plugins/Makefile b/transdecoder_plugins/Makefile
new file mode 100644
index 0000000..ec3f4c9
--- /dev/null
+++ b/transdecoder_plugins/Makefile
@@ -0,0 +1,17 @@
+SHELL := /bin/bash
+# Name of the bundled cd-hit release: ${CDHIT}.tgz is unpacked and the build runs inside the ${CDHIT} directory.
+CDHIT="cd-hit-v4.6.1-2012-08-27"
+
+all: cdhit
+
+# Unpack and build cd-hit, install it with PREFIX=../../util/bin, then rename the build directory to ./cdhit.
+cdhit:
+	tar -xvf ${CDHIT}.tgz \
+	&& cd ${CDHIT} \
+    && $(MAKE) openmp=yes && $(MAKE) install PREFIX=../../util/bin
+	mv ${CDHIT} cdhit
+
+# Remove the ./cdhit build directory and the files matched by ../util/bin/* (dotfiles are not matched by the glob).
+clean:
+	rm -rf ./cdhit
+	rm -f ../util/bin/*
diff --git a/transdecoder_plugins/cd-hit-v4.6.1-2012-08-27.tgz b/transdecoder_plugins/cd-hit-v4.6.1-2012-08-27.tgz
new file mode 100644
index 0000000..7afeb6c
Binary files /dev/null and b/transdecoder_plugins/cd-hit-v4.6.1-2012-08-27.tgz differ
diff --git a/util/bin/.hidden b/util/bin/.hidden
new file mode 100644
index 0000000..e69de29
diff --git a/util/cdna_alignment_orf_to_genome_orf.pl b/util/cdna_alignment_orf_to_genome_orf.pl
new file mode 100755
index 0000000..2284f66
--- /dev/null
+++ b/util/cdna_alignment_orf_to_genome_orf.pl
@@ -0,0 +1,327 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use GFF3_utils;
+use Data::Dumper;
+use Fasta_reader;
+
+my $usage = "\nusage: $0 cdna_orfs.genes.gff3 cdna_genome.alignments.gff3 cdna.fasta\n\n";
+# positional arguments, all required, in the order shown in the usage message
+my $cdna_orfs_gff3 = $ARGV[0] or die $usage;
+my $cdna_genome_gff3 = $ARGV[1] or die $usage;
+my $cdna_fasta = $ARGV[2] or die $usage;
+
+
+my $WARNING_COUNT = 0; # count those orfs that appear to be on strand opposite from the transcribed strand.
+# Driver: places every ORF gene model of the cDNA-based GFF3 onto its genome alignment and prints it as GFF3.
+main: {
+
+    my %cdna_seq_lengths = &parse_cdna_seq_lengths($cdna_fasta);
+    
+    my %orf_counter;
+
+    my %cdna_acc_to_transcript_structure = &parse_transcript_alignment_info($cdna_genome_gff3);
+
+    ## parse ORFs on cDNAs
+
+    my $gene_obj_indexer_href = {};
+    ## associate gene identifiers with contig id's.
+    my $contig_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($cdna_orfs_gff3, $gene_obj_indexer_href);
+
+    foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
+    
+        my @gene_ids = @{$contig_to_gene_list_href->{$asmbl_id}};
+    
+        foreach my $gene_id (@gene_ids) {
+            my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
+            # from here on $asmbl_id is the gene object's own value; it shadows the loop variable
+            my $asmbl_id = $gene_obj_ref->{asmbl_id};
+            
+            
+            ## pasa stuff
+            # ids of the form S<digits>_asmbl_<digits> are reduced to the asmbl_<digits> part
+            if ($asmbl_id =~ /(S\d+)_(asmbl_\d+)/) { 
+                
+                my $subcluster = $1;
+                $asmbl_id = $2;
+            }
+            
+            my $transcript_struct = $cdna_acc_to_transcript_structure{$asmbl_id} or die "Error, no cdna struct for $asmbl_id";
+            
+            #print Dumper($transcript_struct) . $gene_obj_ref->toString();
+            # eval traps any die raised during placement so the offending structures can be dumped first
+            eval {
+                my $new_orf_gene = &place_orf_in_cdna_alignment_context($transcript_struct, $gene_obj_ref, \%cdna_seq_lengths);
+                
+                if ($new_orf_gene) {
+                    # post-increment: $orf_count holds the previous count; only the commented-out naming below uses it
+                    my $orf_count = $orf_counter{$asmbl_id}++;
+                    $new_orf_gene->{asmbl_id} = $transcript_struct->{contig};
+                    #$new_orf_gene->{TU_feat_name} = "t.$asmbl_id.$orf_count";
+                    #$new_orf_gene->{Model_feat_name} = "m.$asmbl_id.$orf_count";
+                    $new_orf_gene->{com_name} = "ORF";
+                    
+                    $new_orf_gene->{TU_feat_name} = $gene_id;
+                    $new_orf_gene->{Model_feat_name} = $gene_obj_ref->{Model_feat_name};
+                                        
+
+                    print $new_orf_gene->to_GFF3_format(source => "transdecoder") . "\n";
+                    
+                }
+            };
+
+            if ($@) {
+                # report the transcript structure, the ORF and the trapped message on STDERR, then abort
+                print STDERR "Error occurred.\n";
+                
+                print STDERR Dumper($transcript_struct);
+                print STDERR $gene_obj_ref->toString();
+                print STDERR "$@";
+                die;
+            }
+            
+        }
+    }
+
+    exit(0);
+
+}
+
+####
+sub parse_transcript_alignment_info {
+    # Parse a cDNA-to-genome alignment GFF3 file and group its rows by the id that
+    # follows "Target=" in column 9.
+    # Returns a hash: id => { asmbl, contig, orient, coords => [ [lend, rend], ... ] },
+    # where contig and orient come from the first row seen for that id.
+    # Blank lines and '#' comment/directive lines are skipped; dies on unparsable rows.
+    my ($cdna_align_gff3) = @_;
+
+    my %cdna_alignments;
+
+    # three-arg open: the file name can never be taken as a mode or a pipe
+    open (my $fh, '<', $cdna_align_gff3) or die "Error, cannot open file $cdna_align_gff3: $!";
+    while (my $line = <$fh>) {
+        chomp $line;
+
+        # a GFF3 file may carry directives such as "##gff-version 3" and other comments
+        if ($line !~ /\w/ || $line =~ /^\#/) { next; }
+
+        my ($contig, $lend, $rend, $orient, $info) = (split(/\t/, $line))[0, 3, 4, 6, 8];
+        defined $info or die "Error, expected 9 tab-delimited fields in: $line";
+
+        $info =~ /Target=(\S+)/ or die "Error, cannot parse ID from $info";
+        my $asmbl = $1;
+
+        if (my $struct = $cdna_alignments{$asmbl}) {
+            push (@{$struct->{coords}}, [$lend, $rend]);
+        }
+        else {
+            # first time
+            $cdna_alignments{$asmbl} = { asmbl => $asmbl,
+                                         contig => $contig,
+                                         orient => $orient,
+                                         coords => [ [$lend, $rend] ],
+                                       };
+        }
+
+    }
+
+    close $fh;
+
+    return(%cdna_alignments);
+}
+
+
+####
+sub place_orf_in_cdna_alignment_context {
+    my ($transcript_struct, $orf_gene_obj, $cdna_seq_lengths_href) = @_;
+    # Re-express an ORF predicted on a cDNA in genome coordinates, using the cDNA's aligned segments.
+    my $trans_seq_length = $cdna_seq_lengths_href->{ $transcript_struct->{asmbl} } or die "Error, no length for " . Dumper($transcript_struct);
+    # Returns a new Gene_obj built from the aligned segments and the mapped CDS range,
+    # or undef (after a warning on STDERR) for a minus-strand ORF on a multi-segment alignment.
+
+    ## unwrap the gene
+    my @cds_coords;
+    my $orf_orient = $orf_gene_obj->get_orientation();
+    # collect the CDS interval of every exon that has one, each ordered low to high
+    foreach my $exon ($orf_gene_obj->get_exons()) {
+        
+        if (my $cds_exon = $exon->get_CDS_obj()) {
+
+            my ($lend, $rend) = sort {$a<=>$b} $cds_exon->get_coords();
+            push (@cds_coords, [$lend, $rend]);
+        }
+    }
+
+    @cds_coords = sort {$a->[0]<=>$b->[0]} @cds_coords;
+    # the CDS is handled as one span: left end of the first interval to right end of the last
+    my $cds_span_lend = $cds_coords[0]->[0];
+    my $cds_span_rend = $cds_coords[$#cds_coords]->[1];
+    # clamp the span end to the cDNA length
+    if ($cds_span_rend > $trans_seq_length) {
+        $cds_span_rend = $trans_seq_length;
+    }
+    
+    
+    my @exon_coords = @{$transcript_struct->{coords}};
+    @exon_coords = sort {$a->[0]<=>$b->[0]} @exon_coords;
+    my $trans_orient = $transcript_struct->{orient};
+
+    ## examine each potential context of orf in alignment.
+    
+    my ($cds_genome_lend, $cds_genome_rend);
+    my $transcribed_orient;
+
+    if ($orf_orient eq '+') {
+        # ORF on the cDNA's forward strand: the result takes the alignment's orientation
+
+        if ($trans_orient eq '+') { 
+            # cDNA positions are counted from the leftmost aligned base
+            $cds_genome_lend = &from_cdna_lend($cds_span_lend, \@exon_coords);
+            $cds_genome_rend = &from_cdna_lend($cds_span_rend, \@exon_coords);
+            $transcribed_orient = '+';
+
+        }
+    
+        elsif ($trans_orient eq '-') {
+            # cDNA positions are counted from the rightmost aligned base, so the span ends swap
+            $cds_genome_lend = &from_cdna_rend($cds_span_rend, \@exon_coords);
+            $cds_genome_rend = &from_cdna_rend($cds_span_lend, \@exon_coords);
+            $transcribed_orient = '-';
+
+        }
+        
+    }
+    
+    else {
+        ## orf orient is '-'
+        if (scalar(@exon_coords) > 1) {
+            # any correct ORF should be in the '+' orientation here.... must be a false positive orf or transcript structure is wrong
+            $WARNING_COUNT++;
+            print STDERR "Warning [$WARNING_COUNT], shouldn't have a minus-strand ORF on a spliced transcript structure. Skipping entry.\n";
+            
+            return undef;
+        }
+        # single aligned segment: the result gets the strand opposite to the alignment's orientation
+        if ($trans_orient eq '+') { 
+            # NOTE(review): unlike the other three branches, this one yields lend >= rend (larger genome
+            # coordinate first) - confirm build_gene_obj_exons_n_cds_range tolerates that ordering
+            $cds_genome_lend = &from_cdna_lend($cds_span_rend, \@exon_coords);
+            $cds_genome_rend = &from_cdna_lend($cds_span_lend, \@exon_coords);
+            $transcribed_orient = '-';
+            
+        }
+        
+        elsif ($trans_orient eq '-') {
+            
+            $cds_genome_lend = &from_cdna_rend($cds_span_rend, \@exon_coords);
+            $cds_genome_rend = &from_cdna_rend($cds_span_lend, \@exon_coords);
+            $transcribed_orient = '+';
+        }
+        
+
+        
+    }
+    # NOTE(review): when trans_orient is neither '+' nor '-' nothing above assigns the coords or strand, so undef is passed on
+    my $new_gene_obj = new Gene_obj();
+    $new_gene_obj->build_gene_obj_exons_n_cds_range(\@exon_coords, $cds_genome_lend, $cds_genome_rend, $transcribed_orient);
+    
+    return ($new_gene_obj);
+}
+
+
+####
+sub from_cdna_lend {
+    # Convert a 1-based position along the cDNA into a genome coordinate, reading the
+    # aligned segments from the lowest genome coordinate upward.
+    #   $pt          : position on the cDNA
+    #   $coords_aref : array ref of [lend, rend] genome segments, in any order
+    # Returns the genome coordinate; dies when $pt is not covered by the segments.
+    my ($pt, $coords_aref) = @_;
+
+    my $consumed = 0; # cDNA bases covered by the segments already walked past
+
+    foreach my $segment (sort {$a->[0]<=>$b->[0]} @$coords_aref) {
+        my ($seg_lend, $seg_rend) = @$segment;
+
+        # cDNA positions of this segment's first and last base
+        my $first_pt = $consumed + 1;
+        my $last_pt = $consumed + ($seg_rend - $seg_lend + 1);
+
+        unless ($pt < $first_pt || $pt > $last_pt) {
+            # offsets count up from the segment's left end
+            return($seg_lend + ($pt - $first_pt));
+        }
+
+        $consumed = $last_pt;
+    }
+
+
+    die "Error, couldn't localize pt $pt within coordsets: " . Dumper($coords_aref);
+
+    return;
+}
+
+####
+sub from_cdna_rend {
+    # Convert a 1-based position along the cDNA into a genome coordinate, reading the
+    # aligned segments from the highest genome coordinate downward.
+    #   $pt          : position on the cDNA
+    #   $coords_aref : array ref of [lend, rend] genome segments, in any order
+    # Returns the genome coordinate; dies when $pt is not covered by the segments.
+    my ($pt, $coords_aref) = @_;
+
+    my $consumed = 0; # cDNA bases covered by the segments already walked past
+
+    foreach my $segment (reverse sort {$a->[0]<=>$b->[0]} @$coords_aref) {
+        my ($seg_lend, $seg_rend) = @$segment;
+
+        # cDNA positions of this segment's first and last base
+        my $first_pt = $consumed + 1;
+        my $last_pt = $consumed + ($seg_rend - $seg_lend + 1);
+
+        unless ($pt < $first_pt || $pt > $last_pt) {
+            # offsets count down from the segment's right end
+            return($seg_rend - ($pt - $first_pt));
+        }
+
+        $consumed = $last_pt;
+    }
+
+
+    die "Error, couldn't localize pt $pt within coordsets: " . Dumper($coords_aref);
+
+    return;
+}
+
+####
+sub parse_cdna_seq_lengths {
+    # Returns a hash of sequence length keyed by accession for every record of the
+    # FASTA file.  An accession containing asmbl_<digits> (PASA naming) is keyed by
+    # that part alone.
+    my ($fasta_file) = @_;
+
+    my %seq_lengths;
+
+    my $fasta_reader = Fasta_reader->new($fasta_file);
+
+    while (my $seq_obj = $fasta_reader->next()) {
+
+        my $asmbl = $seq_obj->get_accession();
+
+        # pasa stuff
+        if ($asmbl =~ /(asmbl_\d+)/) {
+            $asmbl = $1;
+        }
+
+        $seq_lengths{$asmbl} = length($seq_obj->get_sequence());
+    }
+
+    return(%seq_lengths);
+}
+
diff --git a/util/compute_base_probs.pl b/util/compute_base_probs.pl
new file mode 100755
index 0000000..cc476ce
--- /dev/null
+++ b/util/compute_base_probs.pl
@@ -0,0 +1,67 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Fasta_reader;
+use Nuc_translator;
+
+my $usage = "usage: $0 transcripts_file [top_strand_only]\n\n";
+
+my $transcripts_file = $ARGV[0] or die $usage;
+my $top_strand_only_flag = $ARGV[1] || 0;
+
+main: {
+
+    # Tally G, A, T and C over every sequence of the FASTA file (and over each
+    # reverse complement unless top_strand_only is set), then print one line per
+    # base: base, count and count/total to three decimals, tab-delimited.
+
+    my %base_counter;
+
+    my $fasta_reader = Fasta_reader->new($transcripts_file);
+
+    while (my $seq_obj = $fasta_reader->next()) {
+
+        my $top_strand = uc $seq_obj->get_sequence();
+        &count_bases($top_strand, \%base_counter);
+
+        next if ($top_strand_only_flag);
+
+        my $bottom_strand = &reverse_complement($top_strand);
+        &count_bases($bottom_strand, \%base_counter);
+    }
+
+
+    my $total = 0;
+    $total += $_ foreach (values %base_counter);
+
+    foreach my $base (sort keys %base_counter) {
+
+        my $count = $base_counter{$base};
+        my $fraction = sprintf("%.3f", $count/$total);
+
+        print join("\t", $base, $count, $fraction) . "\n";
+    }
+
+
+    exit(0);
+}
+
+####
+sub count_bases {
+    # Add the number of G, A, T and C characters found in $sequence to the matching
+    # keys of the hash behind $base_counter_href.  Every other character, lower-case
+    # letters included, is ignored.
+    my ($sequence, $base_counter_href) = @_;
+
+    # the global match visits each G/A/T/C character once, left to right
+    while ($sequence =~ /([GATC])/g) {
+        $base_counter_href->{$1}++;
+    }
+
+    return;
+}
+
diff --git a/util/cufflinks_gtf_genome_to_cdna_fasta.pl b/util/cufflinks_gtf_genome_to_cdna_fasta.pl
new file mode 100755
index 0000000..c90b460
--- /dev/null
+++ b/util/cufflinks_gtf_genome_to_cdna_fasta.pl
@@ -0,0 +1,113 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Fasta_reader;
+
+my $usage = "usage: $0 cufflinks.gtf genome.fasta\n\n";
+
+my $cufflinks_gtf = $ARGV[0] or die $usage;
+my $genome = $ARGV[1] or die $usage;
+
+main: {
+	# Collects the exon rows of a Cufflinks GTF per transcript, then prints each transcript's
+	# sequence (built by Gene_obj from the genome FASTA) as a FASTA record on STDOUT.
+	print STDERR "-parsing cufflinks output: $cufflinks_gtf\n";
+	my %genome_trans_to_coords;  # scaffold -> gene_id -> transcript_id -> { end5 => end3 } per exon
+	# three-arg open: the file name can never be taken as a mode or a pipe
+	open (my $fh, '<', $cufflinks_gtf) or die "Error, cannot open file $cufflinks_gtf: $!";
+	while (<$fh>) {
+		chomp;
+		
+		unless (/\w/) { next; }
+		if (/^\#/) { next; }  # GTF comment line, not a feature row
+		my @x = split(/\t/);
+		
+		my $scaff = $x[0];
+		my $type = $x[2];
+		my $lend = $x[3];
+		my $rend = $x[4];
+
+		my $orient = $x[6];
+		
+		my $info = $x[8];
+		
+		unless ($type eq 'exon') { next; }
+
+		my @parts = split(/;/, $info);
+		my %atts;
+		foreach my $part (@parts) {
+			$part =~ s/^\s+|\s+$//g;  # /g so that both ends are trimmed, not just the first match
+			$part =~ s/\"//g;
+			my ($att, $val) = split(/\s+/, $part);
+			unless (defined $att) { next; }
+            
+			if (exists $atts{$att}) {
+				die "Error, already defined attribute $att in $_";
+			}
+			
+			$atts{$att} = $val;
+		}
+
+		my $gene_id = $atts{gene_id} or die "Error, no gene_id at $_";
+		my $trans_id = $atts{transcript_id} or die "Error, no trans_id at $_";
+		# NOTE(review): any strand other than '+' (including '.') is treated as minus here
+		my ($end5, $end3) = ($orient eq '+') ? ($lend, $rend) : ($rend, $lend);
+
+		$genome_trans_to_coords{$scaff}->{$gene_id}->{$trans_id}->{$end5} = $end3;
+
+	}
+	close $fh;
+
+    ## get genome sequence
+    
+    print STDERR "-parsing genome fasta: $genome\n";
+    my $fasta_reader = Fasta_reader->new($genome);
+    my %genome_seqs = $fasta_reader->retrieve_all_seqs_hash();
+    print STDERR "-done parsing genome.\n";
+    
+
+
+	## Output each transcript's sequence in FASTA format (ids sorted for reproducible order):
+
+	foreach my $scaff (sort keys %genome_trans_to_coords) {
+
+        print STDERR "// processing $scaff\n";
+
+        my $genome_seq = $genome_seqs{$scaff} or die "Error, no seq for $scaff";
+
+		my $genes_href = $genome_trans_to_coords{$scaff};
+
+		foreach my $gene_id (sort keys %$genes_href) {
+
+			my $trans_href = $genes_href->{$gene_id};
+
+			foreach my $trans_id (sort keys %$trans_href) {
+
+				my $coords_href = $trans_href->{$trans_id};
+
+				my $gene_obj = Gene_obj->new();
+
+				$gene_obj->{TU_feat_name} = $gene_id;
+				$gene_obj->{Model_feat_name} = $trans_id;
+				$gene_obj->{com_name} = "cufflinks $gene_id $trans_id";
+				
+				$gene_obj->{asmbl_id} = $scaff;
+				
+				$gene_obj->populate_gene_object($coords_href, $coords_href);
+			
+                my $cdna_seq = $gene_obj->create_cDNA_sequence(\$genome_seq);
+				
+				print ">$trans_id $gene_id\n$cdna_seq\n";
+			}
+		}
+	}
+    
+
+	exit(0);
+}
+
diff --git a/util/cufflinks_gtf_to_alignment_gff3.pl b/util/cufflinks_gtf_to_alignment_gff3.pl
new file mode 100755
index 0000000..7d15125
--- /dev/null
+++ b/util/cufflinks_gtf_to_alignment_gff3.pl
@@ -0,0 +1,97 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+
+my $usage = "usage: $0 cufflinks.gtf\n\n";
+
+my $cufflinks_gtf = $ARGV[0] or die $usage;
+
+
+main: {
+	# Collects the exon rows of a Cufflinks GTF per transcript and prints each transcript in alignment GFF3 format.
+	my %genome_trans_to_coords;  # scaffold -> gene_id -> transcript_id -> { end5 => end3 } per exon
+	# three-arg open: the file name can never be taken as a mode or a pipe
+	open (my $fh, '<', $cufflinks_gtf) or die "Error, cannot open file $cufflinks_gtf: $!";
+	while (<$fh>) {
+		chomp;
+		
+		unless (/\w/) { next; }
+		if (/^\#/) { next; }  # GTF comment line, not a feature row
+		my @x = split(/\t/);
+		
+		my $scaff = $x[0];
+		my $type = $x[2];
+		my $lend = $x[3];
+		my $rend = $x[4];
+
+		my $orient = $x[6];
+		
+		my $info = $x[8];
+		
+		unless ($type eq 'exon') { next; }
+
+		my @parts = split(/;/, $info);
+		my %atts;
+		foreach my $part (@parts) {
+			$part =~ s/^\s+|\s+$//g;  # /g so that both ends are trimmed, not just the first match
+			$part =~ s/\"//g;
+			my ($att, $val) = split(/\s+/, $part);
+			unless (defined $att) { next; }
+            
+			if (exists $atts{$att}) {
+				die "Error, already defined attribute $att in $_";
+			}
+			
+			$atts{$att} = $val;
+		}
+
+		my $gene_id = $atts{gene_id} or die "Error, no gene_id at $_";
+		my $trans_id = $atts{transcript_id} or die "Error, no trans_id at $_";
+		# NOTE(review): any strand other than '+' (including '.') is treated as minus here
+		my ($end5, $end3) = ($orient eq '+') ? ($lend, $rend) : ($rend, $lend);
+
+		$genome_trans_to_coords{$scaff}->{$gene_id}->{$trans_id}->{$end5} = $end3;
+
+	}
+	close $fh;
+
+	## Output genes in gff3 format (ids sorted for reproducible order):
+
+	foreach my $scaff (sort keys %genome_trans_to_coords) {
+
+		my $genes_href = $genome_trans_to_coords{$scaff};
+
+		foreach my $gene_id (sort keys %$genes_href) {
+
+			my $trans_href = $genes_href->{$gene_id};
+
+			foreach my $trans_id (sort keys %$trans_href) {
+
+				my $coords_href = $trans_href->{$trans_id};
+
+				my $gene_obj = Gene_obj->new();
+
+				$gene_obj->{TU_feat_name} = $gene_id;
+				$gene_obj->{Model_feat_name} = $trans_id;
+				$gene_obj->{com_name} = "cufflinks $gene_id $trans_id";
+				
+				$gene_obj->{asmbl_id} = $scaff;
+				
+				$gene_obj->populate_gene_object($coords_href, $coords_href);
+                
+				print $gene_obj->to_alignment_GFF3_format($trans_id, "$trans_id", "Cufflinks");
+				
+				print "\n";
+			}
+		}
+	}
+
+
+	exit(0);
+}
+
diff --git a/util/cufflinks_gtf_to_bed.pl b/util/cufflinks_gtf_to_bed.pl
new file mode 100755
index 0000000..28547ba
--- /dev/null
+++ b/util/cufflinks_gtf_to_bed.pl
@@ -0,0 +1,98 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use File::Basename;
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+
+my $usage = "usage: $0 cufflinks.gtf\n\n";
+
+my $cufflinks_gtf = $ARGV[0] or die $usage;
+
+
+main: {
+	# Collects the exon rows of a Cufflinks GTF per transcript and prints each transcript as a BED record.
+	my %genome_trans_to_coords;  # scaffold -> gene_id -> transcript_id -> { end5 => end3 } per exon
+	# three-arg open: the file name can never be taken as a mode or a pipe
+	open (my $fh, '<', $cufflinks_gtf) or die "Error, cannot open file $cufflinks_gtf: $!";
+	while (<$fh>) {
+		chomp;
+		
+		unless (/\w/) { next; }
+		if (/^\#/) { next; }  # GTF comment line, not a feature row
+		my @x = split(/\t/);
+		
+		my $scaff = $x[0];
+		my $type = $x[2];
+		my $lend = $x[3];
+		my $rend = $x[4];
+
+		my $orient = $x[6];
+		
+		my $info = $x[8];
+		
+		unless ($type eq 'exon') { next; }
+
+		my @parts = split(/;/, $info);
+		my %atts;
+		foreach my $part (@parts) {
+			$part =~ s/^\s+|\s+$//g;  # /g so that both ends are trimmed, not just the first match
+			$part =~ s/\"//g;
+			my ($att, $val) = split(/\s+/, $part);
+			unless (defined $att) { next; }  # whitespace-only fragment, e.g. after the final ';'
+			if (exists $atts{$att}) {
+				die "Error, already defined attribute $att in $_";
+			}
+			
+			$atts{$att} = $val;
+		}
+
+		my $gene_id = $atts{gene_id} or die "Error, no gene_id at $_";
+		my $trans_id = $atts{transcript_id} or die "Error, no trans_id at $_";
+		# NOTE(review): any strand other than '+' (including '.') is treated as minus here
+		my ($end5, $end3) = ($orient eq '+') ? ($lend, $rend) : ($rend, $lend);
+
+		$genome_trans_to_coords{$scaff}->{$gene_id}->{$trans_id}->{$end5} = $end3;
+
+	}
+	close $fh;
+
+	## Output transcripts in BED format (ids sorted for reproducible order):
+
+    print "track name=\'" . basename($cufflinks_gtf) . "\'\n";
+    
+	foreach my $scaff (sort keys %genome_trans_to_coords) {
+
+		my $genes_href = $genome_trans_to_coords{$scaff};
+
+		foreach my $gene_id (sort keys %$genes_href) {
+
+			my $trans_href = $genes_href->{$gene_id};
+
+			foreach my $trans_id (sort keys %$trans_href) {
+
+				my $coords_href = $trans_href->{$trans_id};
+
+				my $gene_obj = Gene_obj->new();
+
+				$gene_obj->{TU_feat_name} = $gene_id;
+				$gene_obj->{Model_feat_name} = $trans_id;
+				$gene_obj->{com_name} = "$gene_id $trans_id";
+				
+				$gene_obj->{asmbl_id} = $scaff;
+				
+				$gene_obj->populate_gene_object($coords_href, $coords_href);
+			
+				print $gene_obj->to_BED_format();
+								
+			}
+		}
+	}
+
+
+	exit(0);
+}
+
diff --git a/util/ffindex_resume.pl b/util/ffindex_resume.pl
new file mode 100755
index 0000000..1ea8c9c
--- /dev/null
+++ b/util/ffindex_resume.pl
@@ -0,0 +1,51 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $input_idx = shift;
+my $done_input_idx = shift||die ("input IDX, basename of done IDX\n");
+my $done_input = $done_input_idx;
+$done_input =~s/\.idx$/.db/;
+my $counter=int(0);
+my (%hash_done,%hash_all);
+# Collect every entry of the input index, keyed by its leading numeric id.
+open (my $all_fh, '<', $input_idx) or die "Error, cannot open $input_idx: $!\n";
+while (my $ln=<$all_fh>){
+        $ln=~/^(\d+)\s/ ||next;
+        $hash_all{$1} = $ln;
+}
+close $all_fh;
+
+# Collect the ids listed in any ./<done basename>* file; lines ending in a lone 1 are not counted as done.
+my @files_done = glob("./$done_input_idx*");
+foreach my $done (@files_done){
+ open (my $done_fh, '<', $done) or do { warn "Warning, cannot open $done: $!\n"; next; };
+  while (my $ln=<$done_fh>){
+    $ln=~/^(\d+)\s/ ||next;
+    next if $ln=~/\b1$/;
+    $hash_done{$1}=1;
+  }
+ close $done_fh;
+}
+
+print "All: ".scalar(keys(%hash_all))."\n";
+print "Done: ".scalar(keys(%hash_done))."\n";
+
+my $notdone_file = $input_idx.".notdone";
+open (my $out_fh, '>', $notdone_file) or die "Error, cannot write $notdone_file: $!\n";
+foreach my $i (sort {$a <=> $b} keys %hash_all){
+ next if $hash_done{$i};
+ $counter++;
+ print $out_fh $hash_all{$i};
+}
+
+close $out_fh or die "Error, cannot finish writing $notdone_file: $!\n";
+# The backslash before 000 below is doubled so that tr receives \000; "\000" would embed a NUL byte in the command.
+print "Found $counter unfinished\n";
+if ($counter==0){
+   unlink($notdone_file);
+   system("cat $done_input | tr -d '\\000' > $done_input.txt");
+   exit;
+}
+
diff --git a/util/gene_list_to_gff.pl b/util/gene_list_to_gff.pl
new file mode 100755
index 0000000..d603abe
--- /dev/null
+++ b/util/gene_list_to_gff.pl
@@ -0,0 +1,45 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Gene_obj_indexer;
+use GFF3_utils;
+use Carp;
+
+$|++;
+# Prints, as GFF3, every gene named in a list file (gene id, optional tab, optional replacement name).
+my $usage = "\n\nusage: $0 gene_ID_list_file gene_db.inx_file  \n\nNote, you must first run index_gff3_files.pl\n\n";
+
+my $gene_ID_list = $ARGV[0] or die $usage;
+my $inx_file = $ARGV[1] or die $usage;
+
+my $gene_obj_indexer = Gene_obj_indexer->new( { "use" => "$inx_file" } );
+# three-arg open: the file name can never be taken as a mode or a pipe
+open (my $fh, '<', $gene_ID_list) or die "Error, cannot open file $gene_ID_list: $!";
+while (<$fh>) {
+    unless (/\w/) { next;}
+    chomp;
+    my ($gene_id, $com_name) = split (/\t/);
+    $gene_id =~ s/\s+//g;  # /g: remove all whitespace, not only the first run
+
+    my $gene_obj = $gene_obj_indexer->get_gene($gene_id);
+
+    if (defined ($com_name) && $com_name =~ /\w/) {
+        $gene_obj->{com_name} = $com_name;
+    }
+
+    print $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
+
+}
+close $fh;
+
+
+exit(0);
+
+
+
+
diff --git a/util/get_top_longest_fasta_entries.pl b/util/get_top_longest_fasta_entries.pl
new file mode 100755
index 0000000..26b778c
--- /dev/null
+++ b/util/get_top_longest_fasta_entries.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Fasta_reader;
+
+my $usage = "usage: $0 file.fasta numTopLongest\n\n";
+
+my $file = $ARGV[0] or die $usage;
+my $num_longest = $ARGV[1] or die $usage;
+
+main: {
+
+	# Load every record of the FASTA file together with its sequence length, order
+	# the records longest first and print them in FASTA format until numTopLongest
+	# records have been written.
+
+	my @entries;
+
+	my $fasta_reader = Fasta_reader->new($file);
+
+	while (my $seq_obj = $fasta_reader->next()) {
+
+		my $len = length($seq_obj->get_sequence());
+
+		push (@entries, [$seq_obj, $len]);
+	}
+
+	# ascending numeric sort on length, then reversed
+	@entries = reverse sort {$a->[1]<=>$b->[1]} @entries;
+
+	my $num_printed = 0;
+
+	foreach my $entry (@entries) {
+
+		my $seq_obj = $entry->[0];
+
+		print $seq_obj->get_FASTA_format();
+
+		last if (++$num_printed >= $num_longest);
+	}
+
+	exit(0);
+}
+
diff --git a/util/gff3_file_to_bed.pl b/util/gff3_file_to_bed.pl
new file mode 100755
index 0000000..99c7114
--- /dev/null
+++ b/util/gff3_file_to_bed.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use GFF3_utils;
+use Carp;
+use Nuc_translator;
+use File::Basename;
+
+my $usage = "\n\nusage: $0 gff3_file\n\n";
+
+my $gff3_file = $ARGV[0] or die $usage;
+
+# gene objects keyed by gene id; handed to the indexing call below and read back afterwards
+my $gene_obj_indexer_href = {};
+
+## associate gene identifiers with contig id's.
+my $contig_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href);
+
+# the BED track is named after the input file
+print "track name=\'" . basename($gff3_file) . "\'\n";
+
+# one BED record per gene and per additional isoform, contigs in sorted order
+foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
+
+    foreach my $gene_id (@{$contig_to_gene_list_href->{$asmbl_id}}) {
+
+        my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
+        my @isoforms = ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms());
+
+        foreach my $isoform (@isoforms) {
+            my $bed_text = $isoform->to_BED_format();
+            print $bed_text;
+        }
+    }
+}
+
+
+exit(0);
+
diff --git a/util/gff3_file_to_proteins.pl b/util/gff3_file_to_proteins.pl
new file mode 100755
index 0000000..cf83bdf
--- /dev/null
+++ b/util/gff3_file_to_proteins.pl
@@ -0,0 +1,164 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Fasta_reader;
+use GFF3_utils;
+use Carp;
+use Nuc_translator;
+
+my $usage = "\n\nusage: $0 gff3_file genome_db [prot|CDS|cDNA|gene,default=prot] [flank=0]\n\n";
+# Extracts the protein, CDS, cDNA or gene sequence of every gene model in a GFF3 file and prints it as FASTA.
+my $gff3_file = $ARGV[0] or die $usage;
+my $fasta_db = $ARGV[1] or die $usage;
+my $seq_type = $ARGV[2] || "prot";
+my $flank = $ARGV[3] || 0;
+# flank is either one length applied to both sides or "upstream:downstream"
+my ($upstream_flank, $downstream_flank) = (0,0);
+
+if ($flank) {
+	if ($flank =~ /:/) {
+		($upstream_flank, $downstream_flank) = split (/:/, $flank);
+	}
+	else {
+		($upstream_flank, $downstream_flank) = ($flank, $flank);
+	}
+}
+
+if ($upstream_flank < 0 || $downstream_flank < 0) {
+	die $usage;
+}
+
+
+
+unless ($seq_type =~ /^(prot|CDS|cDNA|gene)$/) {
+    die "Error, don't understand sequence type [$seq_type]\n\n$usage";
+}
+
+
+## read genome
+my $fasta_reader = new Fasta_reader($fasta_db);
+my %genome = $fasta_reader->retrieve_all_seqs_hash();
+
+
+my $gene_obj_indexer_href = {};
+
+## associate gene identifiers with contig id's.
+my $contig_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href);
+
+foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
+    
+    my $genome_seq = $genome{$asmbl_id} or die "Error, no sequence for $asmbl_id";
+    
+    my @gene_ids = @{$contig_to_gene_list_href->{$asmbl_id}};
+    
+    foreach my $gene_id (@gene_ids) {
+        my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
+		
+        my %params;
+        if ($seq_type eq "gene") {
+            $params{unspliced_transcript} = 1;
+        }
+        # the unspliced_transcript option is passed only when the 'gene' sequence type was requested
+        $gene_obj_ref->create_all_sequence_types(\$genome_seq, %params);
+        # $counter numbers the isoforms so that the gene sequence is emitted for the first one only
+		my $counter = 0;
+        foreach my $isoform ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms()) {
+ 
+			$counter++;
+
+			my $orientation = $isoform->get_orientation();
+			my ($model_lend, $model_rend) = sort {$a<=>$b} $isoform->get_model_span();
+			my ($gene_lend, $gene_rend) = sort {$a<=>$b} $isoform->get_gene_span();
+			
+            my $isoform_id = $isoform->{Model_feat_name};
+            
+            my $seq = "";
+
+            if ($seq_type eq "prot") {
+                $seq = $isoform->get_protein_sequence();
+            }
+            elsif ($seq_type eq "CDS") {
+                $seq = $isoform->get_CDS_sequence();
+				if ($upstream_flank || $downstream_flank) {
+					$seq = &add_flank($seq, $upstream_flank, $downstream_flank, $model_lend, $model_rend, $orientation, \$genome_seq);
+				}
+			}
+            elsif ($seq_type eq "cDNA") {
+                $seq = $isoform->get_cDNA_sequence();
+				if ($upstream_flank || $downstream_flank) {
+					$seq = &add_flank($seq, $upstream_flank, $downstream_flank, $gene_lend, $gene_rend, $orientation, \$genome_seq);
+				}
+			}
+            elsif ($seq_type eq "gene" && $counter == 1) {
+                $seq = $isoform->get_gene_sequence();
+				if ($upstream_flank || $downstream_flank) {
+					$seq = &add_flank($seq, $upstream_flank, $downstream_flank, $gene_lend, $gene_rend, $orientation, \$genome_seq);
+				}
+			}
+            # nothing retrieved (this includes every isoform after the first in 'gene' mode): warn and skip
+            unless ($seq) {
+                print STDERR "-warning, no $seq_type sequence for $isoform_id\n";
+                next;
+            }
+
+            $seq =~ s/(\S{60})/$1\n/g; # make fasta format
+            chomp $seq;
+            
+            my $com_name = $isoform->{com_name} || "";
+            
+			if ($com_name eq $isoform_id) { $com_name = ""; } # no sense in repeating it
+
+			my $locus = $isoform->{pub_locus};
+			my $model_locus = $isoform->{model_pub_locus};
+			
+			my $locus_string = "";
+			if ($model_locus) {
+                $locus_string .= $model_locus;
+			}
+			if ($locus) {
+				$locus_string .= " $locus";
+			}
+			if ($locus_string) {
+				$locus_string .= " "; # add spacer
+			}
+            
+            #if ($seq_type eq 'prot' || $seq_type eq 'CDS') {  # this was a bad idea, just use the original id.
+            #    $isoform_id = "cds.$isoform_id";
+            #}
+            # header: isoform id, gene id, optional loci, optional name, then contig:lend-rend(strand) of the model span
+            print ">$isoform_id $gene_id $locus_string $com_name $asmbl_id:$model_lend-$model_rend($orientation)\n$seq\n";
+        }
+    }
+}
+
+
+exit(0);
+
+
+####
+sub add_flank {
+	# Pad $seq with lower-cased genome sequence on both sides; $seq itself is upper-cased.
+	# For '+' features the upstream flank lies left of $lend; for any other orientation the
+	# flanks are swapped and reverse-complemented.  The left flank is clipped at position 1.
+	my ($seq, $upstream_flank, $downstream_flank, $lend, $rend, $orientation, $genome_seq_ref) = @_;
+
+	my $is_plus = ($orientation eq '+');
+	my ($flank_left, $flank_right) = $is_plus ? ($upstream_flank, $downstream_flank) : ($downstream_flank, $upstream_flank);
+
+	my $far_left = $lend - $flank_left;
+	$far_left = 1 if ($far_left < 1);
+
+	# genome coordinates are 1-based, substr offsets 0-based
+	my $left_seq = substr($$genome_seq_ref, $far_left - 1, $lend - $far_left);
+	my $right_seq = substr($$genome_seq_ref, $rend, $flank_right);
+
+	return (lc($left_seq) . uc($seq) . lc($right_seq)) if $is_plus;
+
+	return (lc(&reverse_complement($right_seq)) . uc($seq) . lc(&reverse_complement($left_seq)));
+}
+
+
diff --git a/util/index_gff3_files_by_isoform.pl b/util/index_gff3_files_by_isoform.pl
new file mode 100755
index 0000000..8a7f8b8
--- /dev/null
+++ b/util/index_gff3_files_by_isoform.pl
@@ -0,0 +1,74 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Gene_obj_indexer;
+use GFF3_utils;
+use Carp;
+
+$|++;
+# Builds one index in which every isoform of every gene from the given GFF3 files is stored under its Model_feat_name.
+my $usage = "\n\nusage: $0 gff3_file [ gff3_file, ... ]\n\n"
+    . "\tgenes are indexed by model feat_name.\n\n";
+
+
+my @gff3_files = @ARGV;
+unless (@gff3_files) { die $usage; }
+# a single input file gets its own <file>.inx; several inputs share gene_structures.inx
+my $index_file = "gene_structures.inx";
+if (scalar @gff3_files == 1) {
+    $index_file = $gff3_files[0] . ".inx";
+}
+
+my $gene_obj_indexer = new Gene_obj_indexer( { "create" => $index_file } );
+
+my %seen; #track gene_ids already visited
+
+foreach my $gff3_file (@gff3_files) {
+
+#    print STDERR "// indexing $gff3_file\n";
+    # each file is first loaded into a scratch index (tmp.inx), which is unlinked at the end of the iteration
+    ## associate gene identifiers with contig id's.
+    my $temp_inx = "tmp.inx";
+    my $temp_gene_indexer = new Gene_obj_indexer( { "create" => $temp_inx } );
+    
+    my $asmbl_id_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($gff3_file, $temp_gene_indexer);
+    
+    
+    foreach my $asmbl_id (sort keys %$asmbl_id_to_gene_list_href) {
+        
+        my @gene_ids = @{$asmbl_id_to_gene_list_href->{$asmbl_id}};
+        
+        #print "ASMBL: $asmbl_id, gene_ids: @gene_ids\n";
+        
+        foreach my $gene_id (@gene_ids) {
+            # a gene id may occur only once across all input files
+            if ($seen{$gene_id}) {
+                croak "Error, already stored gene_id: [$gene_id], not allowed to have the same gene id multiple GFF3 files.\n";
+            }
+            $seen{$gene_id} = 1;
+            
+            my $gene_obj_ref = $temp_gene_indexer->get_gene($gene_id);
+            # store the gene and each additional isoform as separate entries
+            foreach my $gene_obj ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms()) {
+                
+                $gene_obj->delete_isoforms(); # unbundle the model object!
+                
+                my $model_id = $gene_obj->{Model_feat_name};
+                $gene_obj_indexer->store_gene($model_id, $gene_obj);
+                print STDERR "\r Indexed $model_id ";
+            }
+        }
+        
+    }
+    print STDERR "\n";
+
+    unlink ($temp_inx);
+}
+
+
+exit(0);
+
diff --git a/util/nr_ORFs_gff3.pl b/util/nr_ORFs_gff3.pl
new file mode 100755
index 0000000..9cc94c7
--- /dev/null
+++ b/util/nr_ORFs_gff3.pl
@@ -0,0 +1,75 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Gene_obj_indexer;
+use GFF3_utils;
+use Carp;
+
+my $usage = "\n\nusage: $0 gff3_file\n\n";
+
+my $gff3_file = $ARGV[0] or die $usage;
+
+
+main: {
+
+    # Print, as GFF3, every gene whose CDS token (see get_CDS_token) has not been seen
+    # before; a gene repeating an earlier token is only reported on STDERR.
+
+    my $gene_obj_indexer_href = {};
+
+    my $asmbl_id_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href);
+
+    # CDS tokens already printed; shared across all contigs
+    my %seen;
+
+
+    foreach my $asmbl_id (sort keys %$asmbl_id_to_gene_list_href) {
+
+
+
+        foreach my $gene_id (@{$asmbl_id_to_gene_list_href->{$asmbl_id}}) {
+
+            my $gene_obj = $gene_obj_indexer_href->{$gene_id};
+
+            my $cds_token = &get_CDS_token($gene_obj);
+
+            if ($seen{$cds_token}) {
+                print STDERR "-ignoring entry $cds_token, already represented by another transcript\n";
+                next;
+            }
+
+            print $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
+
+            $seen{$cds_token} = 1;
+        }
+    }
+
+
+
+    exit(0);
+
+}
+
+
+
+####
+sub get_CDS_token {
+    # Returns a string naming the coding structure of $gene_obj: its asmbl_id followed by
+    # ":end5-end3" for every exon that has a CDS object, in the order get_exons() yields them.
+    my ($gene_obj) = @_;
+
+    my $token = $gene_obj->{asmbl_id};
+
+    foreach my $exon ($gene_obj->get_exons()) {
+        my $cds_obj = $exon->get_CDS_obj() or next;
+        my ($cds_end5, $cds_end3) = $cds_obj->get_coords();
+        $token .= ":" . join("-", $cds_end5, $cds_end3);
+    }
+
+    return($token);
+}
+    
diff --git a/util/pfam_mpi.pbs b/util/pfam_mpi.pbs
new file mode 100755
index 0000000..d7d242f
--- /dev/null
+++ b/util/pfam_mpi.pbs
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# These are example SCRIPTs to run PFAM with MPI ON PBS. Minor changes would be needed for LSF etc
+# executable for mpirun, e.g. from OpenMPI
+MPIRUN_EXEC=`which mpirun`
+# these arguments are valid for OpenMPI, change them for Intel's
+MPIRUN_ARGS="-gmca mpi_warn_on_fork 0 -cpus-per-proc 1 -np $2 -machinefile workers.$PBS_JOBID.mpi"
+
+#executable of hhmscan
+HMMSCAN_EXEC=`which hmmscan`
+
+#your PFAM database
+PFAMDB=/home/pap056/30day/databases/pfam/Pfam-AB.hmm.bin
+
+# no need to change the following
+
+export OMP_NUM_THREADS=1
+# the PBS_O_WORKDIR is where you launched the script from. Feel free to change this if you need
+# to launch it from a difference directory
+cd $PBS_O_WORKDIR
+cat ${PBS_NODEFILE} > workers.$PBS_JOBID.mpi
+$MPIRUN_EXEC $MPIRUN_ARGS ffindex_apply_mpi \
+ -d "$1"_out2.db \
+ -i "$1"_out2.db.idx \
+ $1 \
+ $1.idx \
+ -- $HMMSCAN_EXEC -o /dev/null --cpu 1 --noali --cut_nc --acc --notextw --domtblout /dev/stdout $PFAMDB -
diff --git a/util/pfam_runner.pl b/util/pfam_runner.pl
new file mode 100755
index 0000000..3fdb7a2
--- /dev/null
+++ b/util/pfam_runner.pl
@@ -0,0 +1,309 @@
+#!/usr/bin/env perl
+
+use FindBin;
+use strict;
+use warnings;
+use Getopt::Long qw(:config no_ignore_case bundling pass_through);
+use Data::Dumper;
+use List::Util qw (min max);
+use File::Basename;
+
+use lib ("$FindBin::RealBin/../PerlLib");  # this script lives in util/; PerlLib is one level up
+
+use POSIX qw(ceil);
+use Gene_obj;
+use Nuc_translator;
+use Fasta_reader;
+use Longest_orf;
+
+
+
+# This script itself lives in util/, so RealBin already is the util directory
+# (tools are looked up under $UTIL_DIR/bin, libraries under $UTIL_DIR/lib64).
+my $UTIL_DIR = $FindBin::RealBin;
+
+# Command-line option targets, filled in by GetOptions.
+my ($help, $workdir, $verbose, $reuse, $pfam_out);
+my $CPU = 2;    # passed to ParaFly -CPU and used to size the partitions
+
+my $usage =  <<_EOH_;
+
+######################################## Options ###################################################################################
+#
+###############
+# ** Required:
+###############
+
+# --pep <string>                         peptide files
+#
+# --pfam_db <string>                 /path/to/pfam_db.hmm to search 
+#                                        using hmmscan (which should be accessible via your PATH setting)
+#
+################
+# ** Optional:
+################ 
+#
+#
+# --reuse                                If this option is given, any existing files are not overwritten but reused
+#
+# 
+# --pfam_out|o <string>                    You can also pre-run the pfam searches if --reuse is set. In that case, 
+#                                        --pfam_out is the output of hmmscan --domtblout using --noali --cut_nc --acc --notextw
+#
+# --prepare_pfam                         Prepare data for PFAM search and then quit (for running PFAM on HPC/computing cluster
+#                                         with or without MPI )
+#
+# --workdir                              Force temporary output directory to this directory (e.g. if --reuse is needed)
+#
+#
+# -h                                     print this option menu and quit
+# -v                                     verbose
+#
+# --CPU <int>                            number of threads to use; (default: 2)
+#
+# --MPI                                  use MPI (via ffindex_apply_mpi)
+#
+# --quiet                                send stderr to /dev/null
+#
+#
+####################################################################################################################################
+
+_EOH_
+
+    ;
+
+
+my $MPI_DEBUG = 0;            # --debug: ParaFly -vv, and ffindex stderr is not discarded
+my $pep_file;                 # --pep
+my $search_pfam;              # --pfam_db
+my $prepare_pfam_only = 0;    # --prepare_pfam
+my $MPI = 0;                  # --MPI
+
+&GetOptions(
+    'pep=s'        => \$pep_file,
+    'h'            => \$help,
+    'v'            => \$verbose,
+    'CPU=i'        => \$CPU,
+    'pfam_db=s'    => \$search_pfam,
+    'reuse'        => \$reuse,
+    'workdir:s'    => \$workdir,
+    'pfam_out|o=s' => \$pfam_out,
+    'prepare_pfam' => \$prepare_pfam_only,
+    'debug'        => \$MPI_DEBUG,
+    'MPI'          => \$MPI,
+);
+
+die $usage if $help;
+
+# Getopt::Long runs in pass_through mode, so unknown options are left in @ARGV.
+if (@ARGV) {
+    die "Error, don't understand options: @ARGV";
+}
+
+$|++;    # autoflush STDOUT
+our $SEE = $verbose;
+
+unless ($pep_file && $search_pfam) {
+    die "$usage\n";
+}
+
+# $CPU is the divisor used to size the partitions, so it must be at least 1.
+die "Error, --CPU must be a positive integer (got: $CPU)\n" if $CPU < 1;
+
+&check_for_pfam_execs if ($search_pfam);
+
+$workdir = "transdecoder.tmp.$$" unless $workdir;
+mkdir($workdir) unless -d $workdir;
+die "Error, cannot mkdir $workdir" unless -d $workdir;
+
+unless ($pfam_out) {
+    $pfam_out = basename($pep_file) . ".transdecoder.pfam.dat";
+}
+
+
+main: {
+
+    # Partition the peptides and write one hmmscan command per partition.
+    my $parafly_cmd_file = &multithread($pep_file);
+
+    my $parafly_cmd = "ParaFly -CPU $CPU -c $parafly_cmd_file --failed $parafly_cmd_file.failed ";
+    if (!$MPI_DEBUG) {
+        $parafly_cmd .= " -v ";
+    }
+    else {
+        # a little more verbose
+        $parafly_cmd .= " -vv ";
+    }
+
+    if ($prepare_pfam_only) {
+        # --prepare_pfam: only explain how to run the searches elsewhere, then quit.
+        print "We have prepared the $parafly_cmd_file command file for you to run hmmscan separately (e.g. on a cluster).\n";
+        print "Example for a single node with $CPU CPUs :\n\t$parafly_cmd\n\n";
+        print "After your PFAM searches are complete, then concatanate all the out.db files using this command into $pep_file.pfam.out.\n";
+        print "cat $workdir/*.out.db"
+              . '|tr -d \'\000\' '
+              . "|grep -v '^#' > $pep_file.pfam.out\n\n";
+        print "Then in order to restart transdecoder use the following command (along with any other options you want):\n";
+        # The database option of this script is --pfam_db (there is no --search_pfam).
+        print "\t$0 --pep $pep_file --pfam_db $search_pfam --pfam_out $pep_file.pfam.out --reuse --workdir $workdir\n\n";
+
+        exit(0);
+    }
+
+    print "Processing with PFAM HMM searches...\n";
+
+    # With --reuse, an existing non-empty result file is kept and nothing is run.
+    unless ($reuse && -s $pfam_out){
+        &process_cmd("$parafly_cmd");
+        # A non-empty .failed file (the one passed to ParaFly via --failed) is treated as failure.
+        if (-s "$parafly_cmd_file.failed"){
+            die "Some sequences failed to be searched against PFAM. Please resolve the situation (see $parafly_cmd_file), delete $parafly_cmd_file and use --reuse and --workdir $workdir to resume.\n";
+        }
+
+        if ($MPI) {
+            # Merge the ffindex outputs (dropping NUL bytes and '#' lines), then remove the intermediates.
+            &process_cmd("cat $workdir/*.out.db" . '|tr -d "\000" | grep -v "^#" > ' . $pfam_out);
+            &process_cmd("find $workdir -name '*.out.db' -delete");
+            &process_cmd("find $workdir -name '*.out.db.idx' -delete");
+            &process_cmd("find $workdir -name 'parafly.sh*' -delete");
+            unless (-s $pfam_out) {
+                die "Error, pfam results were not properly concatenated into file: $pfam_out";
+            }
+        }
+        else {
+            ## Parse regular pfam table output files:
+            &process_cmd("find $workdir/ -name '*.fa.domtbl' -exec cat {} \\\; | egrep -v '^\#' > $pfam_out");
+        }
+    }
+
+    print STDERR "PFAM SEARCH DONE.\n\n";
+    exit(0);
+
+}
+
+####
+# Echo a shell command to STDOUT, run it with system(), and die when the
+# returned status is non-zero.
+sub process_cmd {
+    my ($cmd) = @_;
+
+    print "CMD: $cmd\n";
+
+    my $ret = system($cmd);
+    die "Error, cmd: $cmd died with ret $ret" if $ret;
+
+    return;
+
+}
+
+# Build the ffindex database ($fasta_file.db and .db.idx) unless a non-empty .db exists; returns the line count of the index.
+sub index_fasta {
+   # this where ffindex would really help
+   my ($fasta_file) = @_;
+   &process_cmd("$UTIL_DIR/bin/ffindex_from_fasta -s $fasta_file.db $fasta_file.db.idx $fasta_file") unless -s "$fasta_file.db";
+   chomp(my $sequence_number = `wc -l < $fasta_file.db.idx`);
+   die "Error, no entries found in index file $fasta_file.db.idx" unless $sequence_number;
+   return $sequence_number;
+}
+
+
+# Partition the peptide file and write one hmmscan command line per partition
+# into $workdir/parafly.sh (executed later through ParaFly); returns that path.
+# MPIrun might require expert users here.... so just use parafly and be done with it (experts can run mpirun directly)
+sub multithread {
+    my ($protein_file) = @_;
+    print STDERR "Partitioning fasta file $protein_file\n";
+    my @fasta_files = &partition_transcript_db($protein_file);
+    my $cmd_file = "$workdir/parafly.sh";
+    unlink($cmd_file.'.completed');   # remove leftover status files of an earlier run
+    unlink($cmd_file.'.failed');
+    open(my $out_fh, '>', $cmd_file) or die "Error, cannot write to $cmd_file: $!";
+    foreach my $fasta_file (@fasta_files){
+
+        if ($MPI) {
+            my $cmd = "hmmscan -o /dev/null --cpu 1 --noali --cut_nc --acc --notextw  --domtblout /dev/stdout $search_pfam - ";
+
+            &index_fasta($fasta_file);   # creates $fasta_file.db / .db.idx when missing
+            my $mpi_out = "$fasta_file.out";   # per-partition prefix (the global $pfam_out is the final result file)
+            my $ffidx = "$UTIL_DIR/bin/ffindex_apply_mpi -d $mpi_out.db -i $mpi_out.idx $fasta_file.db $fasta_file.db.idx -- ";
+            unlink($mpi_out);
+            unlink($mpi_out.'.db');
+            unlink($mpi_out.'.idx');
+
+            if (!$MPI_DEBUG) {
+                $ffidx .= " 2>/dev/null";
+            }
+
+            print $out_fh $ffidx . " $cmd \n";
+        }
+        else {
+            ## use ParaFly / hmmscan w/o ffindex
+            my $cmd = "hmmscan -o /dev/null --cpu 1 --noali --cut_nc --acc --notextw --domtblout $fasta_file.domtbl $search_pfam $fasta_file ";
+            print $out_fh "$cmd\n";
+        }
+    }
+    close $out_fh or die "Error, cannot close $cmd_file: $!";
+    return "$cmd_file";
+}
+
+
+# Split a FASTA file into $workdir/partition.<index of first record>.fa files of at most
+# 5000 records (aiming at one file per CPU); returns the list of files written.
+sub partition_transcript_db {
+    my ($transcript_db) = @_;
+    # Count the FASTA headers in Perl: no shell involved, so any file name works.
+    open(my $in_fh, '<', $transcript_db) or die "Error, cannot read $transcript_db: $!";
+    my $number_of_peps = 0;
+    while (my $line = <$in_fh>) { $number_of_peps++ if $line =~ /^>/; }
+    close $in_fh;
+    die "Error, CPU must be a positive number (got: $CPU)" unless $CPU > 0;
+    my $seqs_per_partition = min(5000, max(1, ceil($number_of_peps / $CPU)));
+    my ($ofh, @files);
+    my $fasta_reader = Fasta_reader->new($transcript_db);
+    my $counter = 0;
+    while (my $seq_obj = $fasta_reader->next()) {
+        my $fasta_entry = $seq_obj->get_FASTA_format();
+        $fasta_entry =~ s/[\*\s]+$//;    # strip stop codon/empty space
+        $fasta_entry .= "\n";
+        if ($counter % $seqs_per_partition == 0) {
+            close $ofh if $ofh;
+            my $outfile = "$workdir/partition.$counter.fa";
+            open($ofh, '>', $outfile) or die "Error, cannot write to outfile: $outfile";
+            push(@files, $outfile);
+        }
+        print $ofh $fasta_entry;
+        $counter++;
+    }
+    close $ofh if $ofh;
+    return(@files);
+}
+
+# Extend PATH / LD_LIBRARY_PATH with the bundled tool directories, then verify
+# that the pfam database and the required executables exist (dies otherwise).
+sub check_for_pfam_execs {
+    $ENV{PATH} .= ":$UTIL_DIR/bin"; # now can find 3rd party tools in PATH setting
+
+    # Add a separator only when already set: no "uninitialized" warning, no leading empty entry.
+    my $lib_dir = "$UTIL_DIR/lib64/";
+    $ENV{LD_LIBRARY_PATH} = $ENV{LD_LIBRARY_PATH} ? "$ENV{LD_LIBRARY_PATH}:$lib_dir" : $lib_dir;
+
+    die "Error, cannot locate pfam database at: $search_pfam"  unless (-s $search_pfam);
+    my @utils = qw(hmmscan ParaFly);
+    push (@utils, qw(ffindex_apply_mpi ffindex_from_fasta)) if $MPI;
+    return &check_program(@utils);
+}
+
+
+
+# Locate each named program with `which`; dies if one cannot be found. Returns
+# the paths found (a symlink is replaced by its readlink() target).
+sub check_program {
+    my @paths;
+    foreach my $prog (@_) {
+        chomp(my $path = `which $prog`);
+        die "Error, path to a required program ($prog) cannot be found\n\n"
+            unless $path =~ /^\//;
+        push(@paths, -l $path ? readlink($path) : $path);
+    }
+    return @paths;
+}
\ No newline at end of file
diff --git a/util/remove_eclipsed_ORFs.pl b/util/remove_eclipsed_ORFs.pl
new file mode 100755
index 0000000..92e7fd1
--- /dev/null
+++ b/util/remove_eclipsed_ORFs.pl
@@ -0,0 +1,92 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Gene_obj_indexer;
+use GFF3_utils;
+use Carp;
+
+$|++;
+
+my $usage = "\n\nusage: $0 gff3_file\n\n";
+
+my $gff3_file = $ARGV[0] or die $usage;
+
+
+# For each asmbl_id, print (as GFF3) every gene except those whose span lies
+# strictly inside the span of a longer gene that has already been kept.
+main: {
+
+    my $gene_obj_indexer_href = {};
+
+    my $asmbl_id_to_gene_list_href
+        = &GFF3_utils::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href);
+
+    foreach my $asmbl_id (sort keys %$asmbl_id_to_gene_list_href) {
+
+        # One record per gene: the gene object, its span (lend <= rend)
+        # and the length of that span.
+        my @gene_entries;
+
+        foreach my $gene_id (@{ $asmbl_id_to_gene_list_href->{$asmbl_id} }) {
+
+            my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
+
+            my ($lend, $rend) = sort {$a<=>$b} $gene_obj_ref->get_coords();
+
+            push (@gene_entries, {
+                gene_obj => $gene_obj_ref,
+                lend     => $lend,
+                rend     => $rend,
+                length   => $rend - $lend + 1,
+            });
+
+        }
+
+        # Longest span first.
+        @gene_entries = reverse sort {$a->{length}<=>$b->{length}} @gene_entries;
+
+        # The longest gene is always kept; every other gene is compared
+        # against all genes kept so far.
+        my @largest_orfs = shift @gene_entries;
+
+        CANDIDATE:
+        while (@gene_entries) {
+
+            my $candidate = shift @gene_entries;
+
+            foreach my $kept (@largest_orfs) {
+
+                ## eclipsed: both ends fall strictly inside a kept gene
+                if ($candidate->{lend} > $kept->{lend}
+                    && $candidate->{rend} < $kept->{rend}) {
+
+                    next CANDIDATE;
+                }
+
+            }
+
+            push (@largest_orfs, $candidate);
+
+        }
+
+
+
+        foreach my $struct (@largest_orfs) {
+
+            my $gene_obj = $struct->{gene_obj};
+
+            print $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
+
+        }
+
+
+    }
+
+
+    exit(0);
+
+}
diff --git a/util/score_CDS_liklihood_all_6_frames.pl b/util/score_CDS_liklihood_all_6_frames.pl
new file mode 100755
index 0000000..08be5a1
--- /dev/null
+++ b/util/score_CDS_liklihood_all_6_frames.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Fasta_reader;
+use Nuc_translator;
+
+my $usage = "usage: $0 CDS hexamerScores\n\n";
+
+my $cds_file = $ARGV[0] or die $usage;
+my $hexamer_scores_file = $ARGV[1] or die $usage;
+
+
+my %scores = &parse_hexamer_scores($hexamer_scores_file);
+
+# For every sequence of the CDS FASTA file, print one tab-delimited line:
+# accession, sequence length, the scores of the three forward frames and then
+# the scores of the three frames of the reverse complement.
+main: {
+    my $fasta_reader = Fasta_reader->new($cds_file);
+    while (my $seq_obj = $fasta_reader->next()) {
+
+        my $accession = $seq_obj->get_accession();
+        my $sequence = uc $seq_obj->get_sequence();
+        my $rev_seq = &reverse_complement($sequence);
+
+        # Frame offsets 0, 1, 2 on each strand (forward strand first).
+        my @scores;
+        foreach my $strand_seq ($sequence, $rev_seq) {
+            foreach my $offset (0, 1, 2) {
+                push @scores, &score_CDS(substr($strand_seq, $offset));
+            }
+        }
+
+        # The accession is passed as an argument (%s), never as part of the
+        # format string, so a '%' in a sequence name cannot corrupt the output.
+        printf("%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
+               $accession, length($sequence), @scores);
+    }
+
+    exit(0);
+
+}
+
+####
+# Score a sequence with the values held in %scores: the score of its first
+# pentamer (looked up as frame 0) plus the score of every hexamer starting at
+# offset 5 or later, each looked up with frame = offset modulo 3. Tokens
+# missing from %scores count as 0; sequences shorter than 5 bases score 0.
+sub score_CDS {
+    my ($sequence) = @_;
+
+    my $seq_length = length($sequence);
+    return(0) if $seq_length < 5;
+
+    ## init score to first pentamer
+    my $total = $scores{ substr($sequence, 0, 5) . "-0" } || 0;
+
+    # last offset at which a complete hexamer still fits
+    my $last_start = $seq_length - 6;
+
+    foreach my $pos (5 .. $last_start) {
+        my $token = substr($sequence, $pos, 6)
+                    . "-" . ($pos % 3);
+        $total += $scores{$token} || 0;
+    }
+
+    return ($total);
+}
+
+
+####
+# Read a tab-delimited "token<TAB>score" file into a hash (token => score); dies if it cannot be opened.
+sub parse_hexamer_scores {
+    my ($hexamer_scores_file) = @_;
+    my %scores;
+    # 3-arg open: the file name is never interpreted as a mode or a command
+    open (my $fh, '<', $hexamer_scores_file) or die "Error, cannot open $hexamer_scores_file: $!";
+    while (my $line = <$fh>) {
+        chomp $line;
+        my ($token, $score) = split (/\t/, $line);
+        $scores{$token} = $score;
+    }
+    close $fh;
+    return (%scores);
+}
+
diff --git a/util/seq_n_baseprobs_to_logliklihood_vals.pl b/util/seq_n_baseprobs_to_logliklihood_vals.pl
new file mode 100755
index 0000000..c0eeca0
--- /dev/null
+++ b/util/seq_n_baseprobs_to_logliklihood_vals.pl
@@ -0,0 +1,172 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Fasta_reader;
+use Nuc_translator;
+
+## hexamer stats
+my %framed_hexamers;
+my %background_hexamers;
+
+
+## pentamer stats
+my %framed_pentamers;
+my %background_base_probs;
+my %framed_all_pentamer_counts;
+
+my $usage = "usage: $0 targetCDSs base_probs.dat\n\n";
+
+my $target_CDS = $ARGV[0] or die $usage;
+my $base_probs_dat_file = $ARGV[1] or die $usage;
+my $debug = $ARGV[2];
+
+main: {
+	# Count framed pentamers and hexamers in the target CDS sequences.
+	&parse_targetCDSs($target_CDS);
+	# Load the background base probabilities.
+	&parse_background($base_probs_dat_file);
+	# Add one pseudocount for every observed framed hexamer.
+	&add_pseudocounts();
+	# Print the log-likelihood table.
+	&report_logliklihood_ratios();
+	
+	exit(0);
+}
+
+
+
+
+
+####
+# Print the score table to STDOUT as "<token>\t<log-likelihood>" lines:
+#  - per framed hexamer: log of (hexamer count / leading-pentamer count in the
+#    same frame) divided by the background probability of the sixth base;
+#  - per framed pentamer: log of (pentamer count / all pentamers counted in
+#    that frame) divided by the product of its bases' background probabilities.
+# K-mers containing anything but G, A, T, C are skipped.
+sub report_logliklihood_ratios {
+
+    ## Markov-based probabilities (5th order markov chain):
+
+    foreach my $framed_hexamer (sort keys %framed_hexamers) {
+        my ($hexamer, $frame) = split (/-/, $framed_hexamer);
+
+        ## ignoring hexamers containing non-GATC bases
+        next if ($hexamer =~ /[^GATC]/);
+
+        my $pentamer = substr($hexamer, 0, 5);
+
+        my $framed_hexamer_count = $framed_hexamers{$framed_hexamer};
+        my $framed_pentamer_count = $framed_pentamers{"${pentamer}-${frame}"};
+
+        my $markov_prob_framed = $framed_hexamer_count / $framed_pentamer_count;
+
+        my $last_base = substr($hexamer, 5, 1);
+        my $background_prob = $background_base_probs{$last_base} or die "Error, no background probability set for base: $last_base of hexamer $hexamer";
+
+        my $logliklihood = log($markov_prob_framed / $background_prob);
+
+        print "$framed_hexamer\t$logliklihood\n";
+    }
+
+
+    ## The Initialization Matrix based on framed pentamer frequencies.
+
+    foreach my $framed_pentamer (sort keys %framed_pentamers) {
+
+        my ($pentamer, $frame) = split (/-/, $framed_pentamer);
+
+        # Test the bases only: the "-<frame>" suffix of the key always matches
+        # /[^GATC]/, so testing the whole key would skip every pentamer.
+        next if ($pentamer =~ /[^GATC]/);
+
+        my $frame_counts = $framed_all_pentamer_counts{$frame};
+        my $framed_pentamer_counts = $framed_pentamers{$framed_pentamer};
+
+        my $prob_framed_pentamer = $framed_pentamer_counts / $frame_counts;
+
+        ## now background
+        my $prob_background_pentamer = 1;
+        foreach my $base (split(//, $pentamer)) {
+            my $base_prob = $background_base_probs{$base} or die "Error, no background probability set for base: $base of pentamer $pentamer";
+            $prob_background_pentamer *= $base_prob;
+        }
+
+        my $logliklihood = log($prob_framed_pentamer / $prob_background_pentamer);
+
+        print "$framed_pentamer\t$logliklihood\n";
+    }
+
+    return;
+}
+
+####
+# Add one pseudocount for every observed framed hexamer: to the hexamer itself,
+# to its leading pentamer in the same frame, and to that frame's pentamer total.
+sub add_pseudocounts {
+
+    for my $hexamer_key (keys %framed_hexamers) {
+
+        my ($hexamer_seq, $frame) = split (/-/, $hexamer_key);
+        my $pentamer_key = substr($hexamer_seq, 0, 5) . "-" . $frame;
+
+        $framed_hexamers{$hexamer_key}++;
+        $framed_pentamers{$pentamer_key}++;
+        $framed_all_pentamer_counts{$frame}++;
+    }
+
+    return;
+}
+
+	
+####
+# Count, over all sequences of a FASTA file, every pentamer and hexamer keyed
+# by "<kmer>-<frame>" (frame = start offset modulo 3) into the file-level
+# %framed_pentamers, %framed_hexamers and %framed_all_pentamer_counts.
+sub parse_targetCDSs {
+    my ($seqFile) = @_;
+
+    my $fasta_reader = Fasta_reader->new($seqFile);
+
+    while (my $seq_obj = $fasta_reader->next()) {
+        my $accession = $seq_obj->get_accession();
+        print STDERR "\r     Target: processing $accession           " if $debug;
+
+        my $sequence = uc $seq_obj->get_sequence();
+        my $seq_len = length($sequence);
+
+        for (my $i = 0; $i <= $seq_len - 5; $i++) {
+            my $frame = $i % 3;
+            $framed_pentamers{ substr($sequence, $i, 5) . "-${frame}" }++;
+            $framed_all_pentamer_counts{$frame}++;
+
+            # a hexamer also starts here unless this is the last pentamer
+            if ($i <= $seq_len - 6) {
+                $framed_hexamers{ substr($sequence, $i, 6) . "-${frame}" }++;
+            }
+        }
+    }
+    # Progress messages go to STDERR: STDOUT carries the score table.
+    print STDERR "\r     CDS base frequency processing complete.           \n" if $debug;
+    return;
+}
+
+#### 
+# Load background base probabilities from a tab-delimited file whose lines are
+# "base<TAB>count<TAB>ratio"; stores the ratio per base in %background_base_probs.
+sub parse_background {
+    my ($base_probs_dat_file) = @_;
+    open (my $fh, '<', $base_probs_dat_file) or die "Error, cannot open file $base_probs_dat_file: $!";
+    while (my $line = <$fh>) {
+        chomp $line;
+        my ($base, undef, $ratio) = split(/\t/, $line);
+        $background_base_probs{$base} = $ratio;
+    }
+    close $fh;
+    return;
+}
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/transdecoder.git



More information about the debian-med-commit mailing list