[med-svn] [transdecoder] 01/01: Imported Upstream version 3.0.0+dfsg
Michael Crusoe
misterc-guest at moszumanska.debian.org
Fri May 20 07:50:00 UTC 2016
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to annotated tag upstream/3.0.0+dfsg
in repository transdecoder.
commit f9ac657db54bd4d1eb376b80ea796e2e4300cfc3
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Wed May 18 02:30:59 2016 -0700
Imported Upstream version 3.0.0+dfsg
---
.gitmodules | 3 +
PerlLib/Gene_obj.pm | 20 +++
TransDecoder.LongOrfs | 81 +++++++--
TransDecoder.Predict | 99 ++++++++---
notes | 3 +
sample_data/Makefile | 14 ++
sample_data/cufflinks_example/Makefile | 6 +
sample_data/cufflinks_example/blastp.outfmt6.gz | Bin 0 -> 1957 bytes
.../cufflinks_example/blastp.results.outfmt6.gz | Bin 1808 -> 0 bytes
sample_data/cufflinks_example/cleanme.pl | 7 +-
sample_data/cufflinks_example/pfam.domtblout.gz | Bin 12838 -> 12460 bytes
sample_data/cufflinks_example/runMe.sh | 41 +++--
sample_data/pasa_example/Makefile | 6 +
sample_data/pasa_example/cleanme.pl | 2 +
sample_data/pasa_example/pasa_assemblies.fasta.gz | Bin 304896 -> 304706 bytes
sample_data/pasa_example/pasa_assemblies.gff3.gz | Bin 55455 -> 55467 bytes
.../pasa_example/pasa_assemblies_described.txt.gz | Bin 0 -> 231156 bytes
sample_data/pasa_example/runMe.sh | 17 +-
sample_data/simple_transcriptome_target/Makefile | 6 +
sample_data/simple_transcriptome_target/cleanme.pl | 1 +
sample_data/simple_transcriptome_target/runMe.sh | 4 +-
util/cdna_alignment_orf_to_genome_orf.pl | 62 ++++---
util/remove_eclipsed_ORFs.pl | 4 +-
util/single_best_ORF_per_transcript.pl | 187 +++++++++++++++++++++
24 files changed, 472 insertions(+), 91 deletions(-)
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..70b3ae8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "TransDecoder.lrgTests"]
+ path = TransDecoder.lrgTests
+ url = https://github.com/TransDecoder/TransDecoder.lrgTests.git
diff --git a/PerlLib/Gene_obj.pm b/PerlLib/Gene_obj.pm
index 990011c..86ea2e4 100755
--- a/PerlLib/Gene_obj.pm
+++ b/PerlLib/Gene_obj.pm
@@ -4450,6 +4450,26 @@ sub has_CDS {
}
+####
+sub set_CDS_phases_from_init_phase {
+ my ($self, $init_phase) = @_;
+
+ my @exons = $self->get_exons();
+
+ my $curr_cds_len = $init_phase;
+
+ foreach my $exon (@exons) {
+ if (my $cds = $exon->get_CDS_obj()) {
+ $cds->set_phase($curr_cds_len % 3);
+ my $cds_len = $cds->length();
+ $curr_cds_len += $cds_len;
+ }
+ }
+
+ return;
+}
+
+
sub set_CDS_phases {
my ($self, $genomic_seq_ref) = @_;
diff --git a/TransDecoder.LongOrfs b/TransDecoder.LongOrfs
index 68b30a4..8935687 100755
--- a/TransDecoder.LongOrfs
+++ b/TransDecoder.LongOrfs
@@ -16,6 +16,8 @@ Required:
Optional:
+ --gene_trans_map <string> gene-to-transcript identifier mapping file (tab-delimited, gene_id<tab>trans_id<return> )
+
-m <int> minimum protein length (default: 100)
-G <string> genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia)
@@ -83,7 +85,7 @@ my $workdir;
my $verbose;
my $search_pfam = "";
my ($reuse,$pfam_out);
-
+my $gene_trans_map_file;
my $MPI_DEBUG = 1;
@@ -93,8 +95,9 @@ my $MPI_DEBUG = 1;
'h' => \$help,
'v' => \$verbose,
'S' => \$TOP_STRAND_ONLY,
- 'p=i' => \$preserve_full_lengths
- );
+ 'p=i' => \$preserve_full_lengths,
+ 'gene_trans_map=s' => \$gene_trans_map_file,
+ );
@@ -116,6 +119,20 @@ if ($genetic_code ne 'universal') {
main: {
+
+ my %gene_trans_map;
+ if ($gene_trans_map_file) {
+ open(my $fh, $gene_trans_map_file) or die "Error, cannot open file $gene_trans_map_file";
+ while (<$fh>) {
+ chomp;
+ my ($gene_id, $trans_id) = split(/\t/);
+ $gene_trans_map{$trans_id} = $gene_id;
+ }
+ close $fh;
+ }
+
+
+
my $workdir = basename($transcripts_file) . ".transdecoder_dir";
unless (-d $workdir) {
mkdir($workdir) or die "Error, cannot mkdir $workdir.";
@@ -175,9 +192,11 @@ main: {
my @orf_structs = $longest_orf_finder->capture_all_ORFs($sequence);
@orf_structs = reverse sort {$a->{length}<=>$b->{length}} @orf_structs;
- print "checking for fp in $acc \n" if ($SEE);
- $longest_orf_finder->check_for_fp_5prime_partials($MIN_PROT_LENGTH,\@orf_structs,$preserve_full_lengths)
- if(defined($preserve_full_lengths));
+ print "checking for fp in $acc \n" if ($SEE);
+
+ if (defined($preserve_full_lengths)) {
+ $longest_orf_finder->check_for_fp_5prime_partials($MIN_PROT_LENGTH,\@orf_structs,$preserve_full_lengths)
+ }
while (@orf_structs) {
my $orf = shift @orf_structs;
@@ -229,19 +248,32 @@ main: {
$model_counter++;
- my $model_id = "$acc|m.$model_counter";
- my $gene_id = "$acc|g.$model_counter";
+
+ my $gene_id;
+ if (%gene_trans_map) {
+ $gene_id = $gene_trans_map{$acc} or die "Error, cannot locate gene identifier for transcript acc: [$acc]";
+ }
+ elsif (my $parsed_gene_id = &try_parse_gene_id_from_acc($acc)) {
+ $gene_id = $parsed_gene_id;
+ }
+ else {
+ $gene_id = "Gene.$model_counter";
+ }
+ ## gene ID and model ID must be unique for each entry, but also decipherable for later on.
+ $gene_id = "${gene_id}::${acc}::g.$model_counter";
+ my $model_id = "${gene_id}::m.$model_counter";
+
$gene_obj->{TU_feat_name} = $gene_id;
$gene_obj->{Model_feat_name} = $model_id;
-
-
+
my $cds = $gene_obj->create_CDS_sequence(\$sequence);
-
+ $gene_obj->set_CDS_phases(\$sequence);
+
unless ($cds) {
die "Error, no CDS for gene: " . Dumper($cds_coords_href) . Dumper($exon_coords_href);
}
-
+
my $got_start = 0;
my $got_stop = 0;
if ($protein =~ /^M/) {
@@ -262,22 +294,24 @@ main: {
$prot_type = "internal";
}
- $gene_obj->{com_name} = "ORF $gene_id $model_id type:$prot_type len:$length ($orient)";
+ $gene_obj->{com_name} = "ORF type:$prot_type len:$length ($orient)";
# this header is identical between CDS and PEP (since PEP is just a direct translation of CDS for a specific translation table)
# we are currently not printing this out at the final data but it would be nice to.
- my $pep_header = ">$model_id $gene_id type:$prot_type len:$length gc:$genetic_code $acc:$start-$stop($orient)\n";
- my $cds_header = ">$model_id $gene_id type:$prot_type len:$length $acc:$start-$stop($orient)\n";
+ my $pep_header = ">$model_id type:$prot_type len:$length gc:$genetic_code $acc:$start-$stop($orient)\n";
+ my $cds_header = ">$model_id type:$prot_type len:$length $acc:$start-$stop($orient)\n";
print PEP $pep_header."$protein\n";
print CDS $cds_header."$cds\n";
+
+
print GFF $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
}
}
-
+
close PEP;
close CDS;
close GFF;
@@ -313,3 +347,18 @@ sub process_cmd {
}
+####
+sub try_parse_gene_id_from_acc {
+ my ($acc) = @_;
+
+ my $gene_id;
+
+ if ($acc =~ /^(\S+_c\d+_g\d+)_i\d+$/) {
+ $gene_id = $1;
+ }
+ elsif ($acc =~ /^(\S+_c\d+)_seq\d+$/) {
+ $gene_id = $1;
+ }
+
+ return($gene_id);
+}
diff --git a/TransDecoder.Predict b/TransDecoder.Predict
index a94080a..838aea4 100755
--- a/TransDecoder.Predict
+++ b/TransDecoder.Predict
@@ -17,10 +17,14 @@ Common options:
--retain_long_orfs <int> retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence
marks it as coding (default: 900 bp => 300aa)
- --retain_pfam_hits <string> /path/to/pfam_db.hmm to search
- using hmmscan (which should be accessible via your PATH setting)
+ --retain_pfam_hits <string> domain table output file from running hmmscan to search Pfam (see transdecoder.github.io for info)
+ Any ORF with a pfam domain hit will be retained in the final output.
- --retain_blastp_hits <string>
+ --retain_blastp_hits <string> blastp output in '-outfmt 6' format.
+ Any ORF with a blast match will be retained in the final output.
+
+ --single_best_orf Retain only the single best ORF per transcript.
+ (Best is defined as having (optionally pfam and/or blast support) and longest orf)
--cpu <int> Use multipe cores for cd-hit-est. (default=1)
@@ -78,6 +82,7 @@ my $retain_pfam_hits_file;
my $retain_blastp_hits_file;
my $cpu = 1;
my $MPI_DEBUG = 1;
+my $single_best_orf_flag = 0;
&GetOptions( 't=s' => \$transcripts_file,
'train:s' => \$train_file,
@@ -97,6 +102,8 @@ my $MPI_DEBUG = 1;
'retain_pfam_hits=s' => \$retain_pfam_hits_file,
'retain_blastp_hits=s' => \$retain_blastp_hits_file,
'cpu=i' => \$cpu,
+
+ 'single_best_orf' => \$single_best_orf_flag,
);
@@ -192,35 +199,59 @@ main: {
# get accs for best entries
my $acc_file = "$cds_file.scores.selected";
{
+
+ my %att_counter;
+
open (my $ofh, ">$acc_file") or die "Error, cannot write to $acc_file";
open (my $ifh, "$cds_file.scores") or die "Error, cannot open file $cds_file.scores";
while (<$ifh>) {
chomp;
my ($acc, $orf_length, @scores) = split(/\t/);
+
+ my @ATTS;
my $score_1 = shift @scores;
my $max_score_other_frame = max(@scores);
- if ($has_pfam_hit{$acc}
- ||
- $has_blastp_hit{$acc}
- ||
- $orf_length >= $RETAIN_LONG_ORFS
- ||
- ($score_1 > 0 && $score_1 > $max_score_other_frame)
- ) {
+
+ my $keep_flag = 0;
+
+ if ($has_pfam_hit{$acc}) {
+ $keep_flag = 1;
+ push (@ATTS, "PFAM");
+ print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
+ }
+ if ($has_blastp_hit{$acc}) {
+ $keep_flag = 1;
+ push (@ATTS, "BLASTP");
+ print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
+ }
+ if ($orf_length >= $RETAIN_LONG_ORFS) {
+ $keep_flag = 1;
+ push (@ATTS, "LONGORF");
+ }
+ if ($score_1 > 0 && $score_1 > $max_score_other_frame) {
+ $keep_flag = 1;
+ push (@ATTS, "FRAMESCORE");
+ }
+
+ if ($keep_flag) {
print $ofh "$acc\n";
-
- if ($has_pfam_hit{$acc}) {
- print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
- }
- if ($has_blastp_hit{$acc}) {
- print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
- }
-
+
+ my $att_string = join("|", sort @ATTS);
+ $att_counter{$att_string}++;
}
}
close $ifh;
close $ofh;
+
+
+ # report on the categories of the selected ORFs
+ print STDERR "\n#####################\nCounts of kept entries according to attributes:\n";
+ foreach my $att (reverse sort {$att_counter{$a}<=>$att_counter{$b}} keys %att_counter) {
+ my $count = $att_counter{$att};
+ print STDERR join("\t", $att, $count) . "\n";
+ }
+ print STDERR "########################\n\n\n";
}
# index the current gff file:
@@ -241,8 +272,30 @@ main: {
{
# exclude shadow orfs (smaller orfs in different reading frame that are eclipsed by longer orfs)
- $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $final_output_prefix.gff3";
+ $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $cds_file.eclipsed_removed.gff3";
+
+ my $target_final_file = "$cds_file.eclipsed_removed.gff3";
+
&process_cmd($cmd);
+
+
+ if ($single_best_orf_flag) {
+ $cmd = "$UTIL_DIR/single_best_ORF_per_transcript.pl ";
+ if ($retain_blastp_hits_file) {
+ $cmd .= " --blast_hits $retain_blastp_hits_file ";
+ }
+ if ($retain_pfam_hits_file) {
+ $cmd .= " --pfam_hits $retain_pfam_hits_file ";
+ }
+ $cmd .= " --gff3_file $cds_file.eclipsed_removed.gff3 > $cds_file.single_best_orf.gff3";
+
+ &process_cmd($cmd);
+
+ $target_final_file = "$cds_file.single_best_orf.gff3";
+ }
+
+
+ &process_cmd("cp $target_final_file $final_output_prefix.gff3");
## write final outputs:
@@ -268,12 +321,6 @@ main: {
$cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file CDS > $best_cds_file";
&process_cmd($cmd);
- # make a CDS file:
- my $best_cdna_file = $best_pep_file;
- $best_cdna_file =~ s/\.pep$/\.mRNA/;
- $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl $gff3_file $transcripts_file cDNA > $best_cdna_file";
- &process_cmd($cmd);
-
}
print STDERR "transdecoder is finished. See output files $final_output_prefix.\*\n\n\n";
diff --git a/notes b/notes
index 7806c64..8cd2de1 100644
--- a/notes
+++ b/notes
@@ -1 +1,4 @@
git clone https://github.com/TransDecoder/TransDecoder.git
+
+GeneID paper:
+http://genome.cshlp.org/content/10/4/511.long
diff --git a/sample_data/Makefile b/sample_data/Makefile
new file mode 100644
index 0000000..451b9a5
--- /dev/null
+++ b/sample_data/Makefile
@@ -0,0 +1,14 @@
+
+DIRS=cufflinks_example pasa_example simple_transcriptome_target
+
+test:
+ @for i in $(DIRS); do \
+ echo "Running example in $$$i..."; \
+ (cd $$i; $(MAKE) test) || exit $$?; done
+
+
+clean:
+ @for i in $(DIRS); do \
+ echo "Running example in $$i..."; \
+ (cd $$i; $(MAKE) clean) || exit $$?; done
+
diff --git a/sample_data/cufflinks_example/Makefile b/sample_data/cufflinks_example/Makefile
new file mode 100644
index 0000000..4ab2c6d
--- /dev/null
+++ b/sample_data/cufflinks_example/Makefile
@@ -0,0 +1,6 @@
+test:
+ ./runMe.sh
+
+clean:
+ ./cleanme.pl
+
diff --git a/sample_data/cufflinks_example/blastp.outfmt6.gz b/sample_data/cufflinks_example/blastp.outfmt6.gz
new file mode 100644
index 0000000..e3824ef
Binary files /dev/null and b/sample_data/cufflinks_example/blastp.outfmt6.gz differ
diff --git a/sample_data/cufflinks_example/blastp.results.outfmt6.gz b/sample_data/cufflinks_example/blastp.results.outfmt6.gz
deleted file mode 100644
index 09e79bf..0000000
Binary files a/sample_data/cufflinks_example/blastp.results.outfmt6.gz and /dev/null differ
diff --git a/sample_data/cufflinks_example/cleanme.pl b/sample_data/cufflinks_example/cleanme.pl
index f5f72ad..1dfa3d9 100755
--- a/sample_data/cufflinks_example/cleanme.pl
+++ b/sample_data/cufflinks_example/cleanme.pl
@@ -17,11 +17,12 @@ my @files_to_keep = qw (cleanme.pl
test.tophat.sam.gz
transcripts.gtf.gz
-
+blastp.outfmt6.gz
pfam.domtblout.gz
-blastp.results.outfmt6.gz
- );
+Makefile
+
+ );
my %keep = map { + $_ => 1 } @files_to_keep;
diff --git a/sample_data/cufflinks_example/pfam.domtblout.gz b/sample_data/cufflinks_example/pfam.domtblout.gz
index 9b7af55..7a22fa6 100644
Binary files a/sample_data/cufflinks_example/pfam.domtblout.gz and b/sample_data/cufflinks_example/pfam.domtblout.gz differ
diff --git a/sample_data/cufflinks_example/runMe.sh b/sample_data/cufflinks_example/runMe.sh
index fc27d70..450faa2 100755
--- a/sample_data/cufflinks_example/runMe.sh
+++ b/sample_data/cufflinks_example/runMe.sh
@@ -1,22 +1,21 @@
-#!/bin/bash -e
+#!/bin/bash -ve
-if [ -e test.genome.fasta.gz ] && [ ! -e test.genome.fasta ]; then
+export PERL_HASH_SEED=0
+
+if [ ! -e test.genome.fasta ]; then
gunzip -c test.genome.fasta.gz > test.genome.fasta
fi
-if [ -e test.tophat.sam.gz ] && [ ! -e test.tophat.sam ]; then
- gunzip -c test.tophat.sam.gz > test.tophat.sam
-fi
-if [ -e transcripts.gtf.gz ] && [ ! -e transcripts.gtf ]; then
+if [ ! -e transcripts.gtf ]; then
gunzip -c transcripts.gtf.gz > transcripts.gtf
fi
-if [ -e blastp.results.outfmt6.gz ] && [ ! -e blastp.results.outfmt6 ]; then
- gunzip -c blastp.results.outfmt6.gz > blastp.results.outfmt6
+if [ ! -e blastp.outfmt6 ]; then
+ gunzip -c blastp.outfmt6.gz > blastp.outfmt6
fi
-if [ -e pfam.domtblout.gz ] && [ ! -e pfam.domtblout ]; then
+if [ ! -e pfam.domtblout ]; then
gunzip -c pfam.domtblout.gz > pfam.domtblout
fi
@@ -32,9 +31,21 @@ fi
## Predict likely ORFs
-if [ $1 ]; then
+if [ 1 ]; then # always doing this now.
+
+ # this is how I would have run blast and pfam but I'm using precomputed results for ease of demonstration.
+ #BLASTDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/uniprot_sprot.pep
+ #PFAMDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/Pfam-A.hmm
+ #
+ ## run blast
+ #blastp -query transcripts.fasta.transdecoder_dir/longest_orfs.pep -db $BLASTDB -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > blastp.outfmt6
+ #
+ ## run pfam
+ #hmmscan --domtblout pfam.domtblout $PFAMDB transcripts.fasta.transdecoder_dir/longest_orfs.pep > pfam.log
+
+
## use pfam and blast results:
- ../../TransDecoder.Predict -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.results.outfmt6 -v
+ ../../TransDecoder.Predict -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6 --single_best_orf -v
else
# just coding metrics
../../TransDecoder.Predict -t transcripts.fasta
@@ -52,10 +63,8 @@ fi
# convert the genome-based gene-gff3 file to bed
../../util/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed
-echo
-echo
-echo Done! Coding region genome annotations provided as: best_candidates.eclipsed_orfs_removed.genome.gff3
-echo
-echo
+
+# Done! Coding region genome annotations provided as: transcripts.fasta.transdecoder.genome.\*
+
exit 0
diff --git a/sample_data/pasa_example/Makefile b/sample_data/pasa_example/Makefile
new file mode 100644
index 0000000..4ab2c6d
--- /dev/null
+++ b/sample_data/pasa_example/Makefile
@@ -0,0 +1,6 @@
+test:
+ ./runMe.sh
+
+clean:
+ ./cleanme.pl
+
diff --git a/sample_data/pasa_example/cleanme.pl b/sample_data/pasa_example/cleanme.pl
index 88f5233..2278c9a 100755
--- a/sample_data/pasa_example/cleanme.pl
+++ b/sample_data/pasa_example/cleanme.pl
@@ -17,6 +17,8 @@ genome.fasta.gz
pasa_assemblies.fasta.gz
pasa_assemblies.gff3.gz
runMe.sh
+pasa_assemblies_described.txt.gz
+Makefile
);
diff --git a/sample_data/pasa_example/pasa_assemblies.fasta.gz b/sample_data/pasa_example/pasa_assemblies.fasta.gz
index 3eca927..0da5588 100644
Binary files a/sample_data/pasa_example/pasa_assemblies.fasta.gz and b/sample_data/pasa_example/pasa_assemblies.fasta.gz differ
diff --git a/sample_data/pasa_example/pasa_assemblies.gff3.gz b/sample_data/pasa_example/pasa_assemblies.gff3.gz
index bdb2aac..c635200 100644
Binary files a/sample_data/pasa_example/pasa_assemblies.gff3.gz and b/sample_data/pasa_example/pasa_assemblies.gff3.gz differ
diff --git a/sample_data/pasa_example/pasa_assemblies_described.txt.gz b/sample_data/pasa_example/pasa_assemblies_described.txt.gz
new file mode 100644
index 0000000..4e29025
Binary files /dev/null and b/sample_data/pasa_example/pasa_assemblies_described.txt.gz differ
diff --git a/sample_data/pasa_example/runMe.sh b/sample_data/pasa_example/runMe.sh
index 6470d43..4a625c1 100755
--- a/sample_data/pasa_example/runMe.sh
+++ b/sample_data/pasa_example/runMe.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -ve
if [ ! -e genome.fasta ]; then
gunzip -c genome.fasta.gz > genome.fasta
@@ -12,10 +12,21 @@ if [ ! -e pasa_assemblies.gff3 ]; then
gunzip -c pasa_assemblies.gff3.gz > pasa_assemblies.gff3
fi
-../../TransDecoder.LongOrfs -t pasa_assemblies.fasta
+if [ ! -e pasa_assemblies_described.txt ]; then
+ gunzip -c pasa_assemblies_described.txt.gz > pasa_assemblies_described.txt
+fi
+
+
+# get the gene-to-transcript relationships
+cut -f2,3 pasa_assemblies_described.txt > pasa.gene_trans_map.txt
+
+../../TransDecoder.LongOrfs -t pasa_assemblies.fasta --gene_trans_map pasa.gene_trans_map.txt
../../TransDecoder.Predict -t pasa_assemblies.fasta
-../../util/cdna_alignment_orf_to_genome_orf.pl pasa_assemblies.fasta.transdecoder.gff3 pasa_assemblies.gff3 pasa_assemblies.fasta | tee pasa_assemblies.fasta.transdecoder.genome.gff3
+../../util/cdna_alignment_orf_to_genome_orf.pl pasa_assemblies.fasta.transdecoder.gff3 pasa_assemblies.gff3 pasa_assemblies.fasta > pasa_assemblies.fasta.transdecoder.genome.gff3
+
+echo "Done. See pasa_assemblies.fasta.transdecoder.\*"
+
exit 0
diff --git a/sample_data/simple_transcriptome_target/Makefile b/sample_data/simple_transcriptome_target/Makefile
new file mode 100644
index 0000000..4ab2c6d
--- /dev/null
+++ b/sample_data/simple_transcriptome_target/Makefile
@@ -0,0 +1,6 @@
+test:
+ ./runMe.sh
+
+clean:
+ ./cleanme.pl
+
diff --git a/sample_data/simple_transcriptome_target/cleanme.pl b/sample_data/simple_transcriptome_target/cleanme.pl
index bbe0088..de8482d 100755
--- a/sample_data/simple_transcriptome_target/cleanme.pl
+++ b/sample_data/simple_transcriptome_target/cleanme.pl
@@ -15,6 +15,7 @@ my @files_to_keep = qw (
cleanme.pl
runMe.sh
Trinity.fasta.gz
+Makefile
);
diff --git a/sample_data/simple_transcriptome_target/runMe.sh b/sample_data/simple_transcriptome_target/runMe.sh
index c42569a..3ad9343 100755
--- a/sample_data/simple_transcriptome_target/runMe.sh
+++ b/sample_data/simple_transcriptome_target/runMe.sh
@@ -4,8 +4,8 @@ if [ ! -e Trinity.fasta ]; then
gunzip -c Trinity.fasta.gz > Trinity.fasta
fi
-../../TransDecoder.LongOrfs -t Trinity.fasta
+../../TransDecoder.LongOrfs -t Trinity.fasta
-../../TransDecoder.Predict -t Trinity.fasta
+../../TransDecoder.Predict -t Trinity.fasta --single_best_orf
exit 0
diff --git a/util/cdna_alignment_orf_to_genome_orf.pl b/util/cdna_alignment_orf_to_genome_orf.pl
index 4a3b1ee..84d0d55 100755
--- a/util/cdna_alignment_orf_to_genome_orf.pl
+++ b/util/cdna_alignment_orf_to_genome_orf.pl
@@ -41,28 +41,20 @@ main: {
## associate gene identifiers with contig id's.
my $contig_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($cdna_orfs_gff3, $gene_obj_indexer_href);
- foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
+
+ my %isolated_gene_id_to_new_genes;
+ foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
+ ## $asmbl_id is the actual Transcript identifier from which ORFs were predicted.
+
my @gene_ids = @{$contig_to_gene_list_href->{$asmbl_id}};
foreach my $gene_id (@gene_ids) { # gene identifiers as given by transdecoder on the transcript sequences
my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
- my $asmbl_id = $gene_obj_ref->{asmbl_id};
-
-
- ## pasa stuff
-
- if ($asmbl_id =~ /(S\d+)_(asmbl_\d+)/) {
-
- my $subcluster = $1;
- $asmbl_id = $2;
- }
-
my $transcript_struct = $cdna_acc_to_transcript_structure{$asmbl_id} or die "Error, no cdna struct for $asmbl_id";
- #print Dumper($transcript_struct) . $gene_obj_ref->toString();
-
+
eval {
my $new_orf_gene = &place_orf_in_cdna_alignment_context($transcript_struct, $gene_obj_ref, \%cdna_seq_lengths);
@@ -74,14 +66,17 @@ main: {
#$new_orf_gene->{Model_feat_name} = "m.$asmbl_id.$orf_count";
$new_orf_gene->{com_name} = "ORF";
- my $use_gene_id = $transcript_struct->{gene_id} || $gene_id;
-
-
+ my $use_gene_id = $transcript_struct->{gene_id};
+ unless ($use_gene_id) {
+ ## extract the orig gene id from the incoming gene obj
+ my ($isolated_gene_id, @rest) = split(/::/, $gene_id);
+ $use_gene_id = $isolated_gene_id;
+ }
+
$new_orf_gene->{TU_feat_name} = $use_gene_id;
$new_orf_gene->{Model_feat_name} = $gene_obj_ref->{Model_feat_name};
-
-
- print $new_orf_gene->to_GFF3_format(source => "transdecoder") . "\n";
+
+ push (@{$isolated_gene_id_to_new_genes{$use_gene_id}}, $new_orf_gene);
}
};
@@ -99,6 +94,25 @@ main: {
}
}
+ ## output results:
+ foreach my $gene_id (sort keys %isolated_gene_id_to_new_genes) {
+
+ my @gene_objs = @{$isolated_gene_id_to_new_genes{$gene_id}};
+
+ foreach my $gene_obj (@gene_objs) {
+ $gene_obj->set_CDS_phases_from_init_phase(0);
+ }
+
+ my $parent_gene_obj = shift @gene_objs;
+ foreach my $gene_obj (@gene_objs) {
+ $parent_gene_obj->add_isoform($gene_obj);
+ }
+
+ $parent_gene_obj->refine_gene_object();
+
+ print $parent_gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
+ }
+
exit(0);
}
@@ -126,7 +140,7 @@ sub parse_transcript_alignment_info {
my $trans_id = "";
my $gene_id = "";
-
+ ## trick for retaining gene and trans identifier information from cufflinks data
if ($asmbl =~ /^GENE\^(\S+),TRANS\^(\S+)/) {
$gene_id = $1;
$trans_id = $2;
@@ -147,12 +161,12 @@ sub parse_transcript_alignment_info {
[$lend, $rend]
],
- orient => $orient,
+ orient => $orient,
trans_id => $trans_id,
gene_id => $gene_id,
};
-
+
$cdna_alignments{$asmbl} = $struct;
}
@@ -230,7 +244,7 @@ sub place_orf_in_cdna_alignment_context {
if (scalar(@exon_coords) > 1) {
# any correct ORF should be in the '+' orientation here.... must be a false positive orf or transcript structure is wrong
$WARNING_COUNT++;
- print STDERR "Warning [$WARNING_COUNT], shouldn't have a minus-strand ORF on a spliced transcript structure. Skipping entry.\n";
+ print STDERR "Warning [$WARNING_COUNT], shouldn't have a minus-strand ORF on a spliced transcript structure. Skipping entry $orf_gene_obj->{Model_feat_name}.\n";
return undef;
}
diff --git a/util/remove_eclipsed_ORFs.pl b/util/remove_eclipsed_ORFs.pl
index 92e7fd1..4cc3e8d 100755
--- a/util/remove_eclipsed_ORFs.pl
+++ b/util/remove_eclipsed_ORFs.pl
@@ -33,7 +33,7 @@ main: {
my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
- my ($lend, $rend) = sort {$a<=>$b} $gene_obj_ref->get_coords();
+ my ($lend, $rend) = sort {$a<=>$b} $gene_obj_ref->get_model_span();
my $struct = { gene_obj => $gene_obj_ref,
lend => $lend,
@@ -64,6 +64,8 @@ main: {
if ($next_lend > $lend && $next_rend < $rend) {
## eclipsed
+ my $model_feat_name = $gene->{gene_obj}->{Model_feat_name};
+ print STDERR "\tECLIPSE: $model_feat_name is eclipsed by longer ORF, removing it.\n";
$found_eclipsed = 1;
last;
}
diff --git a/util/single_best_ORF_per_transcript.pl b/util/single_best_ORF_per_transcript.pl
new file mode 100755
index 0000000..f34f3a3
--- /dev/null
+++ b/util/single_best_ORF_per_transcript.pl
@@ -0,0 +1,187 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin;
+use lib ("$FindBin::Bin/../PerlLib");
+use Gene_obj;
+use Gene_obj_indexer;
+use GFF3_utils;
+use Carp;
+use Data::Dumper;
+use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
+
+
+
+my $usage = <<__EOUSAGE__;
+
+################################################################################
+#
+# Required:
+#
+# --gff3_file <string> gff3 file w/ predicted ORFs
+#
+# Optional:
+#
+# --blast_hits <string> blast hits file
+#
+# --pfam_hits <string> pfam hits file
+#
+################################################################################
+
+
+__EOUSAGE__
+
+ ;
+
+my $gff3_file;
+my $blast_hits_file;
+my $pfam_hits_file;
+
+
+&GetOptions('gff3_file=s' => \$gff3_file,
+ 'blast_hits=s' => \$blast_hits_file,
+ 'pfam_hits=s' => \$pfam_hits_file,
+ );
+
+unless ($gff3_file) {
+ die $usage;
+}
+
+
+main: {
+
+ my %blast_hits;
+ if ($blast_hits_file) {
+ %blast_hits = &parse_blastp_hits_file($blast_hits_file);
+ }
+
+ my %pfam_hits;
+ if ($pfam_hits_file) {
+ %pfam_hits = &parse_pfam_hits_file($pfam_hits_file);
+ }
+
+ my $gene_obj_indexer_href = {};
+
+ my $asmbl_id_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href);
+
+ foreach my $asmbl_id (sort keys %$asmbl_id_to_gene_list_href) {
+
+ my @gene_ids = @{$asmbl_id_to_gene_list_href->{$asmbl_id}};
+
+ #print "ASMBL: $asmbl_id, gene_ids: @gene_ids\n";
+ my @gene_entries;
+
+ foreach my $gene_id (@gene_ids) {
+
+ my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
+
+ my $model_id = $gene_obj_ref->{Model_feat_name};
+
+ my $homology_count = 0;
+ if ($blast_hits{$model_id}) {
+ $homology_count++;
+ }
+ if ($pfam_hits{$model_id}) {
+ $homology_count++;
+ }
+
+
+ my $struct = { gene_obj => $gene_obj_ref,
+ length => $gene_obj_ref->get_CDS_length(),
+ homology_count => $homology_count,
+ };
+
+ push (@gene_entries, $struct);
+
+
+ }
+
+ @gene_entries = sort {
+
+ $b->{homology_count} <=> $a->{homology_count}
+ ||
+ $b->{length} <=> $a->{length}
+
+ } @gene_entries;
+
+ if (scalar @gene_entries > 1) {
+ print STDERR "ORFs prioritized as:\n";
+ foreach my $entry (@gene_entries) {
+ print STDERR "\t" . join("\t", $entry->{gene_obj}->{Model_feat_name},
+ "homology_count: " . $entry->{homology_count},
+ "len: " . $entry->{length}) . "\n";
+ }
+
+ }
+
+ my $best_gene_entry = shift @gene_entries;
+
+ my $gene_obj = $best_gene_entry->{gene_obj};
+
+ print $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
+
+ foreach my $remaining_gene (@gene_entries) {
+ print STDERR "-discarding " . $remaining_gene->{gene_obj}->{Model_feat_name} . " as not single best orf on trans\n";
+ }
+
+ }
+
+
+ exit(0);
+
+}
+
+
+
+## note borrowed code below from TransDecoder.predict and should consolidate. ## //FIXME
+
+sub parse_pfam_hits_file {
+ my ($pfam_hits_file) = @_;
+
+ my %has_pfam_hit;
+
+ if (! -e $pfam_hits_file) {
+ die "Error, cannot find pfam hits file: $pfam_hits_file";
+ }
+
+ print "PFAM output found and processing...\n";
+ # capture those proteins having pfam hits
+ open (my $fh, $pfam_hits_file) or die "Error, cannot open file: $pfam_hits_file";
+ while (my $ln=<$fh>) {
+ next if $ln=~/^\#/;
+ my @x = split(/\s+/,$ln);
+ next unless $x[3]; # domtbl
+ my $orf_acc = $x[3];
+ $has_pfam_hit{$orf_acc} = 1;
+ }
+ close $fh;
+
+
+ return(%has_pfam_hit);
+}
+
+####
+sub parse_blastp_hits_file {
+ my ($blastp_file) = @_;
+
+ unless (-e $blastp_file) {
+ die "Error, cannot find file $blastp_file";
+ }
+
+ my %blastp_hits;
+
+ open (my $fh, $blastp_file) or die "Error, cannot open file $blastp_file";
+ while (<$fh>) {
+ chomp;
+ my @x = split(/\t/);
+ my $id = $x[0];
+
+ $blastp_hits{$id} = 1;
+ }
+ close $fh;
+
+ return(%blastp_hits);
+}
+
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/transdecoder.git
More information about the debian-med-commit
mailing list