[med-svn] [Git][med-team/rsem][upstream] New upstream version 1.3.1+dfsg

Wed Sep 26 20:44:32 BST 2018

Andreas Tille pushed to branch upstream at Debian Med / rsem


Commits:
2dfc156d by Andreas Tille at 2018-09-26T19:02:01Z
New upstream version 1.3.1+dfsg
- - - - -


12 changed files:

- Gibbs.cpp
- README.md
- convert-sam-for-rsem
- pRSEM/File.py
- rsem-calculate-expression
- rsem-control-fdr
- rsem-generate-ngvector
- rsem-gff3-to-gtf
- rsem-plot-transcript-wiggles
- rsem-prepare-reference
- rsem-run-ebseq
- rsem_perl_utils.pm


Changes:

=====================================
Gibbs.cpp
=====================================
@@ -432,7 +432,7 @@ int main(int argc, char* argv[]) {
     printf("- One isoform's prior per line\n");
     printf("- Priors must be in the same order as in the .ti file\n");
     printf("- Priors for those to-be-omitted isoforms must be included as well\n");
-    printf("- Comments can be added after prior seperated by space(s)\n");
+    printf("- Comments can be added after prior separated by space(s)\n");
 		exit(-1);
 	}
 


=====================================
README.md
=====================================
@@ -12,6 +12,7 @@ Table of Contents
 * [Compilation & Installation](#compilation)
 * [Usage](#usage)
     * [Build RSEM references using RefSeq, Ensembl, or GENCODE annotations](#built)
+    * [Build RSEM references for untypical organisms](#untypical)
 * [Example](#example-main)
 * [Simulation](#simulation)
 * [Generate Transcript-to-Gene-Map from Trinity Output](#gen_trinity)
@@ -216,6 +217,20 @@ rsem-prepare-reference --gtf gencode.v24.annotation.gtf \
 Similar to Ensembl annotation, if you want to use GFF3 files (not
 recommended), add option `--gff3-RNA-patterns transcript`.
 
+#### <a name="untypical"></a> Build RSEM references for untypical organisms
+
+For untypical organisms, such as viruses, you may only have a GFF3 file that contains genes but no transcripts. In that case, turn on `--gff3-genes-as-transcripts` so that RSEM treats each gene as a unique transcript.
+
+Here is an example command:
+
+```
+rsem-prepare-reference --gff3 virus.gff \
+               --gff3-genes-as-transcripts \
+               --bowtie \
+               virus.genome.fa \
+               ref/virus
+```
+
 ### II. Calculating Expression Values
 
 To calculate expression values, you should run the


=====================================
convert-sam-for-rsem
=====================================
@@ -56,11 +56,7 @@ __END__
 
 =head1 NAME
 
-convert-sam-for-rsem
-
-=head1 PURPOSE
-
-Make a RSEM compatible BAM file.
+convert-sam-for-rsem - Make a RSEM compatible BAM file.
 
 =head1 SYNOPSIS
 


=====================================
pRSEM/File.py
=====================================
@@ -10,7 +10,7 @@ class File:
     self.fullname = None  ## file's full name, include dir, base, and all ext
     self.is_gz    = None  ## if file is gzipped
     self.dirname  = None  ## directory name
-    self.basename = None  ## base name sans all extension seperated by dot
+    self.basename = None  ## base name sans all extension separated by dot
     self.filename_sans_ext = None ## no path, no last extension sep by dot
 
 


=====================================
rsem-calculate-expression
=====================================
@@ -774,11 +774,7 @@ __END__
 
 =head1 NAME
 
-rsem-calculate-expression
-
-=head1 PURPOSE
-
-Estimate gene and isoform expression from RNA-Seq data.
+rsem-calculate-expression - Estimate gene and isoform expression from RNA-Seq data.
 
 =head1 SYNOPSIS
 
@@ -1125,7 +1121,7 @@ Using a logistic regression to combine TSS signals from multiple complementary d
 
 =back
 
-Parameters for all the above models are learned from a training set. For detailed explainations, please see prior-enhanced RSEM's paper. (Default: 'pk')
+Parameters for all the above models are learned from a training set. For detailed explanations, please see prior-enhanced RSEM's paper. (Default: 'pk')
 
 =back
 


=====================================
rsem-control-fdr
=====================================
@@ -62,11 +62,7 @@ __END__
 
 =head1 NAME
 
-rsem-control-fdr
-
-=head1 PURPOSE
-
-Filter EBSeq output for statistical significance.
+rsem-control-fdr - Filter EBSeq output for statistical significance.
 
 =head1 SYNOPSIS
 


=====================================
rsem-generate-ngvector
=====================================
@@ -33,11 +33,7 @@ __END__
 
 =head1 NAME
 
-rsem-generate-ngvector
-
-=head1 PURPOSE
-
-Create Ng vector for EBSeq based only on transcript sequences.
+rsem-generate-ngvector - Create Ng vector for EBSeq based only on transcript sequences.
 
 =head1 SYNOPSIS
 


=====================================
rsem-gff3-to-gtf
=====================================
@@ -160,6 +160,13 @@ class Transcript:
 		self.index += 1
 		return interval
 
+	def __next__(self):
+		if self.index == len(self.results):
+			raise StopIteration
+		interval = self.results[self.index]
+		self.index += 1
+		return interval
+
 
 def getTranscript(tid, feature):
 	assert tid != None
@@ -217,6 +224,7 @@ def flush_out(fout):
 parser = HelpOnErrorParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, description = "Convert GFF3 files to GTF files.")
 parser.add_argument("input_GFF3_file", help = "Input GFF3 file.")
 parser.add_argument("output_GTF_file", help = "Output GTF file.")
+parser.add_argument("--make-genes-as-transcripts", help = "GFF3 file does not contain transcripts, make each gene as a transcript.", action = "store_true")
 parser.add_argument("--RNA-patterns", help = "Types of RNAs to be extracted, e.g. mRNA,rRNA", metavar = "<patterns>")
 parser.add_argument("--extract-sequences", help = "If GFF3 file contains reference sequences, extract them to the specified file", metavar = "<output.fa>")
 args = parser.parse_args()
@@ -274,11 +282,16 @@ with open(args.input_GFF3_file) as fin:
 			my_assert(gid not in gid2gname, 
 				"Gene {0} appears multiple times! Last occurrence is at line {1}:\n{2}".format(gid, feature.line_no, feature.line))
 			gid2gname[gid] = feature.getAttribute("Name")
-		elif feature.feature_type == "transcript":
+
+			if args.make_genes_as_transcripts:
+				feature.feature_type = feature.original_type = "transcript"
+				feature.attribute_dict["Parent"] = [feature.attribute_dict["ID"]]
+
+		if feature.feature_type == "transcript":
 			transcript = getTranscript(feature.getAttribute("ID", True), feature)
 			transcript.setTranscript(feature)
-		else:
-			assert feature.feature_type == "exon"
+
+		if feature.feature_type == "exon":
 			for parent in feature.getAttribute("Parent", True):
 				transcript = getTranscript(parent, feature)
 				transcript.addExon(feature)


=====================================
rsem-plot-transcript-wiggles
=====================================
@@ -81,11 +81,7 @@ __END__
 
 =head1 NAME
 
-rsem-plot-transcript-wiggles
-
-=head1 PURPOSE
-
-Generate PDF wiggle plots from transcript or gene ids
+rsem-plot-transcript-wiggles - Generate PDF wiggle plots from transcript or gene ids
 
 =head1 SYNOPSIS
 
@@ -149,7 +145,7 @@ This program generates transcript wiggle plots and outputs them in a pdf file. T
 
 =item B<output_plot_file>
 
-This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page display at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page display a gene. The gene's id is showed at the top and all its transcripts' wiggle plots are showed in this page. The arrangment of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. If allele-specific expression is calculated, the basin unit becomes an allele-specific transcript and transcript ids and gene ids can be used to group allele-specific transcripts.
+This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page displays at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page displays a gene. The gene's id is shown at the top and all its transcripts' wiggle plots are shown on this page. The arrangement of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as the title. The x-axis is position in the transcript and the y-axis is read depth. If allele-specific expression is calculated, the basic unit becomes an allele-specific transcript, and transcript ids and gene ids can be used to group allele-specific transcripts.
 
 =item B<sample_name.transcript.sorted.bam and sample_name.transcript.readdepth>
 


=====================================
rsem-prepare-reference
=====================================
@@ -18,6 +18,7 @@ my $status;
 my $gtfF = "";
 my $gff3F = "";
 my $gff3_RNA_patterns = "";
+my $gff3_genes_as_transcripts = 0;
 my $gtf_sources = "None";
 my $mappingF = "";
 my $polyAChoice = 1; # 0, --polyA, add polyA tails for all isoforms; 1, default, no polyA tails; 2, --no-polyA-subset
@@ -43,6 +44,7 @@ my $star_sjdboverhang = 100;
 GetOptions("gtf=s" => \$gtfF,
 	   "gff3=s" => \$gff3F,
 	   "gff3-RNA-patterns=s" => \$gff3_RNA_patterns,
+     "gff3-genes-as-transcripts" => \$gff3_genes_as_transcripts,
 	   "trusted-sources=s" => \$gtf_sources,
 	   "transcript-to-gene-map=s" => \$mappingF,
 	   "allele-to-gene-map=s" => \$alleleMappingF,
@@ -116,6 +118,9 @@ if ($gff3F ne "") {
     if ($gff3_RNA_patterns ne "") {
 	$command .= " --RNA-patterns $gff3_RNA_patterns";
     }
+    if ($gff3_genes_as_transcripts) {
+      $command .= " --make-genes-as-transcripts";
+    }
     $command .= " $gff3F $gtfF";
     &runCommand($command)
 }
@@ -156,6 +161,7 @@ if ($bowtie) {
 
 if ($bowtie2) { 
     $command = $bowtie2_path."bowtie2-build -f";
+    if ($star_nthreads > 1) { $command .= " --threads $star_nthreads"; }
     if ($quiet) { $command .= " -q"; }
     $command .= " $ARGV[1].idx.fa $ARGV[1]";
     
@@ -219,11 +225,7 @@ __END__
 
 =head1 NAME
 
-rsem-prepare-reference
-
-=head1 PURPOSE
-
-Prepare transcript references for RSEM and optionally build BOWTIE/BOWTIE2/STAR indices.
+rsem-prepare-reference - Prepare transcript references for RSEM and optionally build BOWTIE/BOWTIE2/STAR indices.
 
 =head1 SYNOPSIS
 
@@ -263,6 +265,10 @@ The annotation file is in GFF3 format instead of GTF format. RSEM will first con
 
 <pattern> is a comma-separated list of transcript categories, e.g. "mRNA,rRNA". Only transcripts that match the <pattern> will be extracted. (Default: "mRNA")
 
+=item B<--gff3-genes-as-transcripts>
+
+This option is designed for untypical organisms, such as viruses, whose GFF3 files only contain genes. RSEM will treat each gene as a unique transcript when it converts the GFF3 file into GTF format.
+
 =item B<--trusted-sources> <sources>
 
 <sources> is a comma-separated list of trusted sources, e.g. "ENSEMBL,HAVANA". Only transcripts coming from these sources will be extracted. If this option is off, all sources are accepted. (Default: off)
@@ -329,7 +335,7 @@ Build STAR indices. (Default: off)
 
 =item B<--star-path> <path>
 
-The path to STAR's executable. (Default: the path to STAR executable is assumed to be in user's PATH environment varaible)
+The path to STAR's executable. (Default: the path to STAR executable is assumed to be in user's PATH environment variable)
 
 =item B<--star-sjdboverhang> <int>
 


=====================================
rsem-run-ebseq
=====================================
@@ -38,11 +38,7 @@ __END__
 
 =head1 NAME
 
-rsem-run-ebseq
-
-=head1 PURPOSE
-
-Wrapper for EBSeq to perform differential expression analysis.
+rsem-run-ebseq - Wrapper for EBSeq to perform differential expression analysis.
 
 =head1 SYNOPSIS
 


=====================================
rsem_perl_utils.pm
=====================================
@@ -9,7 +9,7 @@ our @ISA = qw(Exporter);
 our @EXPORT = qw(runCommand);
 our @EXPORT_OK = qw(runCommand collectResults showVersionInfo getSAMTOOLS hasPolyA);
 
-my $version = "RSEM v1.2.31"; # Update version info here
+my $version = "RSEM v1.3.1"; # Update version info here
 my $samtools = "samtools-1.3"; # If update to another version of SAMtools, need to change this
 
 # command, {err_msg}



View it on GitLab: https://salsa.debian.org/med-team/rsem/commit/2dfc156de286a9d83e86487a4b372aba0f2ac6f2

-- 
View it on GitLab: https://salsa.debian.org/med-team/rsem/commit/2dfc156de286a9d83e86487a4b372aba0f2ac6f2
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180926/6724b9c8/attachment-0001.html>