[med-svn] [rsem] 02/04: Imported Upstream version 1.2.26+dfsg
Michael Crusoe
misterc-guest at moszumanska.debian.org
Sun Jan 31 11:33:21 UTC 2016
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository rsem.
commit 1e2dc5ac0da77393e6bbb959c16c876804e7dde6
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Sun Jan 31 03:19:52 2016 -0800
Imported Upstream version 1.2.26+dfsg
---
EBSeq/Makefile | 2 +-
EBSeq/install | 15 +-
EM.cpp | 13 +-
GTFItem.h | 270 ++++++++++++++++++++---------------
Gibbs.cpp | 63 ++++----
README.md | 241 ++++++++++++++++++++++---------
Transcript.h | 41 ++++--
WHAT_IS_NEW | 26 ++++
WriteResults.h | 27 +++-
calcCI.cpp | 92 ++++++++----
extractRef.cpp | 52 +++++--
rsem-calculate-expression | 36 ++++-
rsem-gen-transcript-plots | 11 +-
rsem-gff3-to-gtf | 105 ++++++++++++++
rsem-plot-model | 34 ++++-
rsem-prepare-reference | 53 +++++--
rsem-refseq-extract-primary-assembly | 18 +++
rsem-run-ebseq | 8 +-
rsem_perl_utils.pm | 8 +-
19 files changed, 797 insertions(+), 318 deletions(-)
diff --git a/EBSeq/Makefile b/EBSeq/Makefile
index 78b6d60..20a3559 100644
--- a/EBSeq/Makefile
+++ b/EBSeq/Makefile
@@ -12,4 +12,4 @@ rsem-for-ebseq-calculate-clustering-info : calcClusteringInfo.cpp
$(CC) -O3 -Wall calcClusteringInfo.cpp -o $@
clean :
- rm -rf blockmodeling $(PROGRAMS) *~ BiocInstaller
+ rm -rf blockmodeling gplots $(PROGRAMS) *~ BiocInstaller
diff --git a/EBSeq/install b/EBSeq/install
index d5e96b4..4a8b807 100755
--- a/EBSeq/install
+++ b/EBSeq/install
@@ -3,20 +3,21 @@
.libPaths(c(".", .libPaths()))
result <- suppressWarnings(tryCatch({
library("EBSeq")
- cat("EBSeq already exists.\n")
+ cat("EBSeq v", as.character(packageVersion("EBSeq")), " already exists.\n", sep = "")
}, error = function(err) {
tryCatch({
- source("http://www.bioconductor.org/biocLite.R")
+ source("http://www.bioconductor.org/biocLite.R")
+ biocLite("BiocUpgrade")
biocLite("EBSeq", lib = ".")
library("EBSeq")
- cat("The latest version of EBSeq is successfully installed from Bioconductor.\n")
+ cat("EBSeq v", as.character(packageVersion("EBSeq")), " is successfully installed from Bioconductor.\n", sep = "")
}, error = function(err) {
tryCatch({
- cat("Failed to install the latest version of EBSeq from Bioconductor! Try to install EBSeq v1.1.5 locally instead.\n")
- install.packages(c("blockmodeling_0.1.8.tar.gz", "EBSeq_1.1.5.tar.gz"), lib = ".", repos = NULL)
+ cat("Failed to install EBSeq from Bioconductor! Try to install EBSeq v1.2.0 locally instead.\n")
+ install.packages(c("blockmodeling_0.1.8.tar.gz", "gplots_2.17.0.tar.gz", "EBSeq_1.2.0.tar.gz"), lib = ".", repos = NULL, type = "source")
library("EBSeq")
- cat("EBSeq v1.1.5 is successfully installed locally.\n")
- }, error = function(err) { cat("Failed to install EBSeq v1.1.5 locally!\n") })
+ cat("EBSeq v1.2.0 is successfully installed locally.\n")
+ }, error = function(err) { cat("Failed to install EBSeq v1.2.0 locally!\n") })
})
}))
diff --git a/EM.cpp b/EM.cpp
index e58ffee..a804107 100644
--- a/EM.cpp
+++ b/EM.cpp
@@ -92,6 +92,8 @@ ModelParams mparams;
bool hasSeed;
seedType seed;
+bool appendNames;
+
template<class ReadType, class HitType, class ModelType>
void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, double **&ncpvs, ModelType **&mhps) {
READ_INT_TYPE nReads;
@@ -279,7 +281,7 @@ template<class ModelType>
void writeResults(ModelType& model, double* counts) {
sprintf(modelF, "%s.model", statName);
model.write(modelF);
- writeResultsEM(M, refName, imdName, transcripts, theta, eel, countvs[0]);
+ writeResultsEM(M, refName, imdName, transcripts, theta, eel, countvs[0], appendNames);
}
template<class ReadType, class HitType, class ModelType>
@@ -541,7 +543,7 @@ int main(int argc, char* argv[]) {
bool quiet = false;
if (argc < 6) {
- printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling] [--seed seed]\n\n");
+ printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling] [--seed seed] [--append-names]\n\n");
printf(" refName: reference name\n");
printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
printf(" sampleName: sample's name, including the path\n");
@@ -550,8 +552,9 @@ int main(int argc, char* argv[]) {
printf(" -b: produce bam format output file. (default: off)\n");
printf(" -q: set it quiet\n");
printf(" --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n");
- printf(" --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
+ printf(" --sampling: sample each read from its posterior distribution when BAM file is generated. (default: off)\n");
printf(" --seed uint32: the seed used for the BAM sampling. (default: off)\n");
+ printf(" --append-names: append transcript_name/gene_name when available. (default: off)\n");
printf("// model parameters should be in imdName.mparams.\n");
exit(-1);
}
@@ -571,7 +574,8 @@ int main(int argc, char* argv[]) {
genGibbsOut = false;
pt_fn_list = NULL;
hasSeed = false;
-
+ appendNames = false;
+
for (int i = 6; i < argc; i++) {
if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); }
if (!strcmp(argv[i], "-b")) {
@@ -592,6 +596,7 @@ int main(int argc, char* argv[]) {
seed = 0;
for (int k = 0; k < len; k++) seed = seed * 10 + (argv[i + 1][k] - '0');
}
+ if (!strcmp(argv[i], "--append-names")) appendNames = true;
}
general_assert(nThreads > 0, "Number of threads should be bigger than 0!");
diff --git a/GTFItem.h b/GTFItem.h
index 8002f54..9dac075 100644
--- a/GTFItem.h
+++ b/GTFItem.h
@@ -1,128 +1,166 @@
-#ifndef __GTFITEM__
-#define __GTFITEM__
+/* Copyright (c) 2015
+ Bo Li (University of California, Berkeley)
+ bli25 at berkeley.edu
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ USA
+*/
+
+#ifndef GTFITEM_H_
+#define GTFITEM_H_
#include<cstdio>
+#include<cctype>
#include<cstdlib>
#include<cassert>
#include<string>
#include<sstream>
-#include "utils.h"
-
class GTFItem {
-public:
-
- GTFItem() {
- seqname = source = feature = "";
- score = "";
- start = end = 0;
- strand = 0; //strand is a char variable
- frame = "";
- gene_id = transcript_id = "";
- left = "";
- }
-
- bool operator<(const GTFItem& o) const {
- if (gene_id != o.gene_id) return gene_id < o.gene_id;
- if (transcript_id != o.transcript_id) return transcript_id < o.transcript_id;
+ public:
+
+ GTFItem() {
+ seqname = source = feature = "";
+ score = "";
+ start = end = 0;
+ strand = 0; //strand is a char variable
+ frame = "";
+ gene_id = transcript_id = "";
+ gene_name = transcript_name = "";
+ left = "";
+ }
+
+ bool operator<(const GTFItem& o) const {
+ if (gene_id != o.gene_id) return gene_id < o.gene_id;
+ if (transcript_id != o.transcript_id) return transcript_id < o.transcript_id;
return start < o.start;
- }
-
- void my_assert(char value, std::string& line, const std::string& msg) {
- if (!value) {
- fprintf(stderr, ".gtf file might be corrupted!\n");
- fprintf(stderr, "Stop at line : %s\n", line.c_str());
- fprintf(stderr, "Error Message: %s\n", msg.c_str());
- exit(-1);
- }
- }
-
- void parse(std::string& line) {
- std::istringstream strin(line);
- std::string tmp;
-
- getline(strin, seqname, '\t');
- getline(strin, source, '\t');
- getline(strin, feature, '\t');
- getline(strin, tmp, '\t');
- start = atoi(tmp.c_str());
- getline(strin, tmp, '\t');
- end = atoi(tmp.c_str());
- getline(strin, score, '\t');
- getline(strin, tmp, '\t');
- my_assert((tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-')), line, "Strand is neither '+' nor '-'!");
- strand = tmp[0];
- getline(strin, frame, '\t');
-
- getline(strin, left); // assign attributes and possible comments into "left"
- }
-
- void parseAttributes(std::string& line) {
- std::istringstream strin(left);
- std::string tmp;
- bool find_gene_id = false, find_transcript_id = false;
-
- while (getline(strin, tmp, ';') && (!find_gene_id || !find_transcript_id)) {
- tmp = cleanStr(tmp);
- size_t pos = tmp.find(' ');
- my_assert((pos != std::string::npos), line, "Cannot separate the identifier from the value for attribute " + tmp + "!");
- std::string identifier = tmp.substr(0, pos);
-
- if (identifier == "gene_id") {
- my_assert(!find_gene_id, line, "gene_id appear more than once!");
- tmp = cleanStr(tmp.substr(pos));
- my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!");
- gene_id = tmp.substr(1, tmp.length() - 2);
- find_gene_id = true;
- } else if (identifier == "transcript_id") {
- my_assert(!find_transcript_id, line, "transcript_id appear more than once!");
- tmp = cleanStr(tmp.substr(pos));
- my_assert((tmp[0] == '"' && tmp[tmp.length() - 1] == '"'), line, "Textual attributes should be surrounded by doublequotes!");
- transcript_id = tmp.substr(1, tmp.length() - 2);
- find_transcript_id = true;
- }
- }
-
- my_assert(feature != "exon" || find_gene_id, line, "Cannot find gene_id!");
- my_assert(feature != "exon" || find_transcript_id, line, "Cannot find transcript_id!");
- if (!find_gene_id && feature != "exon") { printf("Warning: line \" %s \" does not contain a gene_id attribute! Since this line will not be used for reference construction, it is skipped. But if you think this GTF file is corrupted, you should find a complelete GTF file instead and rebuild the reference.\n", line.c_str()); }
- if (!find_transcript_id && feature != "exon") { printf("Warning: line \" %s \" does not contain a transcript_id attribute! Since this line will not be used for reference construction, it is skipped. But if you think this GTF file is corrupted, you should find a complelete GTF file instead and rebuild the reference.\n", line.c_str()); }
- }
-
- std::string getSeqName() { return seqname; }
- std::string getSource() { return source; }
- std::string getFeature() { return feature; }
- int getStart() { return start; }
- int getEnd() { return end; }
- char getStrand() { return strand; }
- std::string getScore() { return score; } // float, integer or "." ; let downstream programs parse it
- std::string getFrame() { return frame; } // 0, 1, 2, or "."; let downstream programs parse it
- std::string getGeneID() { return gene_id; }
- std::string getTranscriptID() { return transcript_id; }
- std::string getLeft() { return left; }
-
- void setGeneID(const std::string& gene_id) {
- this->gene_id = gene_id;
- }
-
- std::string toString() {
- std::string val;
- std::ostringstream strout;
- strout<<seqname<<'\t'<<source<<'\t'<<feature<<'\t'<<start<<'\t'<<end<<'\t'<<score<<'\t'<<strand<<'\t'<<frame<<'\t';
- strout<<"gene_id \""<<gene_id<<"\"; transcript_id \""<<transcript_id<<"\";"<<left;
- val = strout.str();
-
- return val;
- }
-
-private:
- std::string seqname, source, feature;
- std::string score;
- int start, end;
- char strand;
- std::string frame;
- std::string gene_id, transcript_id;
- std::string left;
+ }
+
+ void parse(const std::string& line) {
+ std::istringstream strin(line);
+ std::string tmp;
+
+ getline(strin, seqname, '\t');
+ getline(strin, source, '\t');
+ getline(strin, feature, '\t');
+ getline(strin, tmp, '\t');
+ start = atoi(tmp.c_str());
+ getline(strin, tmp, '\t');
+ end = atoi(tmp.c_str());
+ getline(strin, score, '\t');
+ getline(strin, tmp, '\t');
+ gtf_assert((tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-')), line, "Strand is neither '+' nor '-'!");
+ strand = tmp[0];
+ getline(strin, frame, '\t');
+
+ getline(strin, left); // assign attributes and possible comments into "left"
+ }
+
+ void parseAttributes(const std::string& line) {
+ assert(feature == "exon");
+ gene_id = transcript_id = "";
+ gene_name = transcript_name = "";
+
+ std::istringstream strin(left);
+ std::string attribute, identifier;
+ int nleft = 4;
+
+ while (getline(strin, attribute, ';') && nleft > 0 && !strin.eof()) {
+ size_t pos = 0, rpos, len = attribute.length();
+ // locate identifier
+ while (pos < len && isspace(attribute[pos])) ++pos;
+ rpos = pos + 1;
+ while (rpos < len && !isspace(attribute[rpos])) ++rpos;
+ gtf_assert(pos < len && rpos < len, line, "Cannot separate the identifier from the value for attribute " + attribute + "!");
+ identifier = attribute.substr(pos, rpos - pos);
+ // locate value
+ pos = rpos + 1;
+ while (pos < len && attribute[pos] != '"') ++pos;
+ rpos = len - 1;
+ while (rpos > pos && attribute[rpos] != '"') --rpos;
+
+ if (identifier == "gene_id") {
+ gtf_assert(gene_id == "", line, "gene_id appear more than once!");
+ gtf_assert(pos < len && rpos > pos, line, "Attribute " + identifier + "'s value should be surrounded by double quotes!");
+ gtf_assert(pos + 1 < rpos, line, "gene_id cannot be empty!");
+ gene_id = attribute.substr(pos + 1, rpos - pos - 1);
+ --nleft;
+ }
+ else if (identifier == "transcript_id") {
+ gtf_assert(transcript_id == "", line, "transcript_id appear more than once!");
+ gtf_assert(pos < len && rpos > pos, line, "Attribute " + identifier + "'s value should be surrounded by double quotes!");
+ gtf_assert(pos + 1 < rpos, line, "transcript_id cannot be empty!");
+ transcript_id = attribute.substr(pos + 1, rpos - pos - 1);
+ --nleft;
+ }
+ else if (identifier == "gene_name" && gene_name == "" && (pos < len && rpos > pos + 1)) {
+ gene_name = attribute.substr(pos + 1, rpos - pos - 1);
+ --nleft;
+ }
+ else if (identifier == "transcript_name" && transcript_name == "" && (pos < len && rpos > pos + 1)) {
+ transcript_name = attribute.substr(pos + 1, rpos - pos - 1);
+ --nleft;
+ }
+ }
+
+ gtf_assert(gene_id != "", line, "Cannot find gene_id!");
+ gtf_assert(transcript_id != "", line, "Cannot find transcript_id!");
+ }
+
+ const std::string& getSeqName() const { return seqname; }
+ const std::string& getSource() const { return source; }
+ const std::string getFeature() const { return feature; }
+ int getStart() const { return start; }
+ int getEnd() const { return end; }
+ char getStrand() const { return strand; }
+ const std::string& getScore() const { return score; } // float, integer or "." ; let downstream programs parse it
+ const std::string& getFrame() const { return frame; } // 0, 1, 2, or "."; let downstream programs parse it
+ const std::string& getGeneID() const { return gene_id; }
+ const std::string& getTranscriptID() const { return transcript_id; }
+ const std::string& getGeneName() const { return gene_name; }
+ const std::string& getTranscriptName() const { return transcript_name; }
+ const std::string getLeft() { return left; }
+
+ void setGeneID(const std::string& gene_id) {
+ this->gene_id = gene_id;
+ }
+
+ std::string toString() {
+ std::ostringstream strout("");
+ strout<< seqname<< '\t'<< source<< '\t'<< feature<< '\t'<< start<< '\t'<< end<< '\t'<< score<< '\t'<< strand<< '\t'<< frame<< '\t'<< left;
+ return strout.str();
+ }
+
+ private:
+ std::string seqname, source, feature;
+ std::string score;
+ int start, end;
+ char strand;
+ std::string frame;
+ std::string gene_id, transcript_id;
+ std::string gene_name, transcript_name;
+ std::string left;
+
+ void gtf_assert(char value, const std::string& line, const std::string& msg) {
+ if (!value) {
+ fprintf(stderr, "The GTF file might be corrupted!\n");
+ fprintf(stderr, "Stop at line : %s\n", line.c_str());
+ fprintf(stderr, "Error Message: %s\n", msg.c_str());
+ exit(-1);
+ }
+ }
};
#endif
diff --git a/Gibbs.cpp b/Gibbs.cpp
index 7563dd4..223e2ff 100644
--- a/Gibbs.cpp
+++ b/Gibbs.cpp
@@ -64,7 +64,8 @@ vector<Item> hits;
vector<double> eel;
double *mw;
-vector<int> pseudo_counts;
+vector<int> init_counts;
+double pseudoC;
vector<double> pme_c, pve_c; //global posterior mean and variance vectors on counts
vector<double> pme_tpm, pme_fpkm;
@@ -122,8 +123,6 @@ void load_data(char* refName, char* statName, char* imdName) {
N1 = s.size() - 1;
nHits = hits.size();
- totc = N0 + N1 + (M + 1);
-
if (verbose) { printf("Loading data is finished!\n"); }
}
@@ -139,15 +138,22 @@ void load_group_info(char* refName) {
if (verbose) { printf("Loading group information is finished!\n"); }
}
-// Load imdName.omit and initialize the pseudo count vector.
+// Load imdName.omit and initialize the init count vector.
void load_omit_info(const char* imdName) {
char omitF[STRLEN];
- sprintf(omitF, "%s.omit", imdName);
- FILE *fi = fopen(omitF, "r");
- pseudo_counts.assign(M + 1, 1);
+ FILE *fi = NULL;
int tid;
- while (fscanf(fi, "%d", &tid) == 1) pseudo_counts[tid] = 0;
+
+ sprintf(omitF, "%s.omit", imdName);
+ fi = fopen(omitF, "r");
+ init_counts.assign(M + 1, 0);
+ totc = M + 1;
+ while (fscanf(fi, "%d", &tid) == 1) {
+ init_counts[tid] = -1;
+ --totc;
+ }
fclose(fi);
+ totc = totc * pseudoC + N0 + N1;
}
template<class ModelType>
@@ -209,22 +215,6 @@ void init() {
if (verbose) { printf("Initialization finished!\n"); }
}
-//sample theta from Dir(1)
-void sampleTheta(engine_type& engine, vector<double>& theta) {
- gamma_dist gm(1);
- gamma_generator gmg(engine, gm);
- double denom;
-
- theta.assign(M + 1, 0);
- denom = 0.0;
- for (int i = 0; i <= M; i++) {
- theta[i] = (pseudo_counts[i] > 0 ? gmg() : 0.0);
- denom += theta[i];
- }
- assert(denom > EPSILON);
- for (int i = 0; i <= M; i++) theta[i] /= denom;
-}
-
void writeCountVector(FILE* fo, vector<int>& counts) {
for (int i = 0; i < M; i++) {
fprintf(fo, "%d ", counts[i]);
@@ -238,14 +228,13 @@ void* Gibbs(void* arg) {
Params *params = (Params*)arg;
vector<double> theta, tpm, fpkm;
- vector<int> z, counts(pseudo_counts);
+ vector<int> z, counts(init_counts);
vector<double> arr;
uniform_01_generator rg(*params->engine, uniform_01_dist());
// generate initial state
- sampleTheta(*params->engine, theta);
-
+ theta.assign(M + 1, 0.0);
z.assign(N1, 0);
counts[0] += N0;
@@ -254,7 +243,7 @@ void* Gibbs(void* arg) {
len = to - fr;
arr.assign(len, 0);
for (HIT_INT_TYPE j = fr; j < to; j++) {
- arr[j - fr] = theta[hits[j].sid] * hits[j].conprb;
+ arr[j - fr] = hits[j].conprb;
if (j > fr) arr[j - fr] += arr[j - fr - 1]; // cumulative
}
z[i] = hits[fr + sample(rg, arr, len)].sid;
@@ -270,8 +259,8 @@ void* Gibbs(void* arg) {
fr = s[i]; to = s[i + 1]; len = to - fr;
arr.assign(len, 0);
for (HIT_INT_TYPE j = fr; j < to; j++) {
- arr[j - fr] = counts[hits[j].sid] * hits[j].conprb;
- if (j > fr) arr[j - fr] += arr[j - fr - 1]; //cumulative
+ arr[j - fr] = (counts[hits[j].sid] + pseudoC) * hits[j].conprb;
+ if (j > fr) arr[j - fr] += arr[j - fr - 1]; //cumulative
}
z[i] = hits[fr + sample(rg, arr, len)].sid;
++counts[z[i]];
@@ -280,12 +269,12 @@ void* Gibbs(void* arg) {
if (ROUND > BURNIN) {
if ((ROUND - BURNIN - 1) % GAP == 0) {
writeCountVector(params->fo, counts);
- for (int i = 0; i <= M; i++) theta[i] = counts[i] / totc;
+ for (int i = 0; i <= M; i++) theta[i] = (counts[i] < 0 ? 0.0 : (counts[i] + pseudoC) / totc);
polishTheta(M, theta, eel, mw);
calcExpressionValues(M, theta, eel, tpm, fpkm);
for (int i = 0; i <= M; i++) {
- params->pme_c[i] += counts[i] - pseudo_counts[i];
- params->pve_c[i] += double(counts[i] - pseudo_counts[i]) * (counts[i] - pseudo_counts[i]);
+ params->pme_c[i] += counts[i];
+ params->pve_c[i] += double(counts[i]) * counts[i];
params->pme_tpm[i] += tpm[i];
params->pme_fpkm[i] += fpkm[i];
}
@@ -293,7 +282,7 @@ void* Gibbs(void* arg) {
for (int i = 0; i < m; i++) {
int b = gi.spAt(i), e = gi.spAt(i + 1);
double count = 0.0;
- for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j];
+ for (int j = b; j < e; j++) count += counts[j];
params->pve_c_genes[i] += count * count;
}
@@ -301,7 +290,7 @@ void* Gibbs(void* arg) {
for (int i = 0; i < m_trans; i++) {
int b = ta.spAt(i), e = ta.spAt(i + 1);
double count = 0.0;
- for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j];
+ for (int j = b; j < e; j++) count += counts[j];
params->pve_c_trans[i] += count * count;
}
}
@@ -385,7 +374,7 @@ void release() {
int main(int argc, char* argv[]) {
if (argc < 7) {
- printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--seed seed] [-q]\n");
+ printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--seed seed] [--pseudo-count pseudo_count] [-q]\n");
exit(-1);
}
@@ -399,6 +388,7 @@ int main(int argc, char* argv[]) {
nThreads = 1;
hasSeed = false;
+ pseudoC = 1.0;
quiet = false;
for (int i = 7; i < argc; i++) {
@@ -409,6 +399,7 @@ int main(int argc, char* argv[]) {
seed = 0;
for (int k = 0; k < len; k++) seed = seed * 10 + (argv[i + 1][k] - '0');
}
+ if (!strcmp(argv[i], "--pseudo-count")) pseudoC = atof(argv[i + 1]);
if (!strcmp(argv[i], "-q")) quiet = true;
}
verbose = !quiet;
diff --git a/README.md b/README.md
index 7aa1112..6129200 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Table of Contents
* [Introduction](#introduction)
* [Compilation & Installation](#compilation)
* [Usage](#usage)
+ * [Build RSEM references using RefSeq, Ensembl, or GENCODE annotations](#built)
* [Example](#example)
* [Simulation](#simulation)
* [Generate Transcript-to-Gene-Map from Trinity Output](#gen_trinity)
@@ -47,7 +48,7 @@ To compile RSEM, simply run
make
For cygwin users, please uncomment the 3rd and 7th line in
-'sam/Makefile' before you run 'make'.
+`sam/Makefile` before you run `make`.
To compile EBSeq, which is included in the RSEM package, run
@@ -57,18 +58,22 @@ To install, simply put the rsem directory in your environment's PATH
variable.
If you prefer to put all RSEM executables to a bin directory, please
-also remember to put 'rsem_perl_utils.pm' and 'WHAT_IS_NEW' to the
-same bin directory. 'rsem_perl_utils.pm' is required for most RSEM's
-perl scripts and 'WHAT_IS_NEW' contains the RSEM version information.
+also remember to put `rsem_perl_utils.pm` and `WHAT_IS_NEW` to the
+same bin directory. `rsem_perl_utils.pm` is required for most RSEM's
+perl scripts and `WHAT_IS_NEW` contains the RSEM version information.
### Prerequisites
C++, Perl and R are required to be installed.
-To take advantage of RSEM's built-in support for the Bowtie/Bowtie 2
-alignment program, you must have
-[Bowtie](http://bowtie-bio.sourceforge.net) and/or [Bowtie
-2](http://bowtie-bio.sourceforge.net/bowtie2) installed.
+To use the `--gff3` option of `rsem-prepare-reference`, Python is also
+required to be installed.
+
+To take advantage of RSEM's built-in support for the Bowtie/Bowtie
+2/STAR alignment program, you must have
+[Bowtie](http://bowtie-bio.sourceforge.net)/[Bowtie
+2](http://bowtie-bio.sourceforge.net/bowtie2)/[STAR](https://github.com/alexdobin/STAR)
+installed.
## <a name="usage"></a> Usage
@@ -84,17 +89,123 @@ UCSC Genes annotation track, this information can be recovered by
downloading the knownIsoforms.txt file for the appropriate genome.
To prepare the reference sequences, you should run the
-'rsem-prepare-reference' program. Run
+`rsem-prepare-reference` program. Run
rsem-prepare-reference --help
to get usage information or visit the [rsem-prepare-reference
documentation page](rsem-prepare-reference.html).
+#### <a name="built"></a> Build RSEM references using RefSeq, Ensembl, or GENCODE annotations
+
+RefSeq and Ensembl are two frequently used annotations. For human and
+mouse, GENCODE annotaions are also available. In this section, we show
+how to build RSEM references using these annotations. Note that it is
+important to pair the genome with the annotation file for each
+annotation source. In addition, we recommend users to use the primary
+assemblies of genomes. Without loss of generality, we use human genome as
+an example and in addition build Bowtie indices.
+
+For **RefSeq**, the genome and annotation file in GFF3 format can be found
+at RefSeq genomes FTP:
+
+```
+ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/
+```
+
+For example, the human genome and GFF3 file locate at the subdirectory
+`vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.31_GRCh38.p5`. `GCF_000001405.31_GRCh38.p5`
+is the latest annotation version when this section was written.
+
+Download and decompress the genome and annotation files to your working directory:
+
+```
+ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.31_GRCh38.p5/GCF_000001405.31_GRCh38.p5_genomic.fna.gz
+ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.31_GRCh38.p5/GCF_000001405.31_GRCh38.p5_genomic.gff.gz
+```
+
+`GCF_000001405.31_GRCh38.p5_genomic.fna` contains all top level
+sequences, including patches and haplotypes. To obtain the primary
+assembly, run the following RSEM python script:
+
+```
+rsem-refseq-extract-primary-assembly GCF_000001405.31_GRCh38.p5_genomic.fna GCF_000001405.31_GRCh38.p5_genomic.primary_assembly.fna
+```
+
+Then type the following command to build RSEM references:
+
+```
+rsem-prepare-reference --gff3 GCF_000001405.31_GRCh38.p5_genomic.gff \
+ --trusted-sources BestRefSeq,Curated\ Genomic \
+ --bowtie \
+ GCF_000001405.31_GRCh38.p5_genomic.primary_assembly.fna \
+ ref/human_refseq
+```
+
+In the above command, `--trusted-sources` tells RSEM to only extract
+transcripts from RefSeq sources like `BestRefSeq` or `Curated Genomic`. By
+default, RSEM trust all sources. There is also an
+`--gff3-RNA-patterns` option and its default is `mRNA`. Setting
+`--gff3-RNA-patterns mRNA,rRNA` will allow RSEM to extract all mRNAs
+and rRNAs from the genome. Visit [here](rsem-prepare-reference.html)
+for more details.
+
+Because the gene and transcript IDs (e.g. gene1000, rna28655)
+extracted from RefSeq GFF3 files are hard to understand, it is
+recommended to turn on the `--append-names` option in
+`rsem-calculate-expression` for better interpretation of
+quantification results.
+
+For **Ensembl**, the genome and annotation files can be found at
+[Ensembl FTP](http://uswest.ensembl.org/info/data/ftp/index.html).
+
+Download and decompress the human genome and GTF files:
+
+```
+ftp://ftp.ensembl.org/pub/release-83/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
+ftp://ftp.ensembl.org/pub/release-83/gtf/homo_sapiens/Homo_sapiens.GRCh38.83.gtf.gz
+```
+
+Then use the following command to build RSEM references:
+
+```
+rsem-prepare-reference --gtf Homo_sapiens.GRCh38.83.gtf \
+ --bowtie \
+ Homo_sapiens.GRCh38.dna.primary_assembly.fa \
+ ref/human_ensembl
+```
+
+If you want to use GFF3 file instead, which is unnecessary and not
+recommended, you should add option `--gff3-RNA-patterns transcript`
+because `mRNA` is replaced by `transcript` in Ensembl GFF3 files.
+
+**GENCODE** only provides human and mouse annotations. The genome and
+ annotation files can be found from [GENCODE
+ website](http://www.gencodegenes.org/).
+
+Download and decompress the human genome and GTF files:
+
+```
+ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/GRCh38.primary_assembly.genome.fa.gz
+ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.annotation.gtf.gz
+```
+
+Then type the following command:
+
+```
+rsem-prepare-reference --gtf gencode.v24.annotation.gtf \
+ --bowtie \
+ GRCh38.primary_assembly.genome.fa \
+ ref/human_gencode
+```
+
+Similar to Ensembl annotation, if you want to use GFF3 files (not
+recommended), add option `--gff3-RNA-patterns transcript`.
+
### II. Calculating Expression Values
To calculate expression values, you should run the
-'rsem-calculate-expression' program. Run
+`rsem-calculate-expression` program. Run
rsem-calculate-expression --help
@@ -104,8 +215,8 @@ documentation page](rsem-calculate-expression.html).
#### Calculating expression values from single-end data
For single-end models, users have the option of providing a fragment
-length distribution via the '--fragment-length-mean' and
-'--fragment-length-sd' options. The specification of an accurate fragment
+length distribution via the `--fragment-length-mean` and
+`--fragment-length-sd` options. The specification of an accurate fragment
length distribution is important for the accuracy of expression level
estimates from single-end data. If the fragment length mean and sd are
not provided, RSEM will not take a fragment length distribution into
@@ -114,19 +225,19 @@ consideration.
#### Using an alternative aligner
By default, RSEM automates the alignment of reads to reference
-transcripts using the Bowtie aligner. Turn on '--bowtie2' for
-'rsem-prepare-reference' and 'rsem-calculate-expression' will allow
+transcripts using the Bowtie aligner. Turn on `--bowtie2` for
+`rsem-prepare-reference` and `rsem-calculate-expression` will allow
RSEM to use the Bowtie 2 alignment program instead. Please note that
indel alignments, local alignments and discordant alignments are
disallowed when RSEM uses Bowtie 2 since RSEM currently cannot handle
-them. See the description of '--bowtie2' option in
-'rsem-calculate-expression' for more details. Similarly, turn on
-'--star' will allow RSEM to use the STAR aligner. To use an
+them. See the description of `--bowtie2` option in
+`rsem-calculate-expression` for more details. Similarly, turn on
+`--star` will allow RSEM to use the STAR aligner. To use an
alternative alignment program, align the input reads against the file
-'reference_name.idx.fa' generated by 'rsem-prepare-reference', and
+`reference_name.idx.fa` generated by `rsem-prepare-reference`, and
format the alignment output in SAM or BAM format. Then, instead of
-providing reads to 'rsem-calculate-expression', specify the '--sam' or
-'--bam' option and provide the SAM or BAM file as an argument.
+providing reads to `rsem-calculate-expression`, specify the `--sam` or
+`--bam` option and provide the SAM or BAM file as an argument.
RSEM requires the alignments of a read to be adjacent. For
paired-end reads, RSEM also requires the two mates of any alignment be
@@ -136,7 +247,7 @@ please run
rsem-sam-validator <input.sam/input.bam>
If your file does not satisfy the requirements, you can use
-'convert-sam-for-rsem' to convert it into a BAM file which RSEM can
+`convert-sam-for-rsem` to convert it into a BAM file which RSEM can
process. Please run
convert-sam-for-rsem --help
@@ -148,29 +259,29 @@ page](convert-sam-for-rsem.html).
However, please note that RSEM does ** not ** support gapped
alignments. So make sure that your aligner does not produce alignments
with intersions/deletions. Also, please make sure that you use
-'reference_name.idx.fa' , which is generated by RSEM, to build your
+`reference_name.idx.fa` , which is generated by RSEM, to build your
aligner's indices.
### III. Visualization
-RSEM contains a version of samtools in the 'sam' subdirectory. RSEM
-will always produce three files:'sample_name.transcript.bam', the
-unsorted BAM file, 'sample_name.transcript.sorted.bam' and
-'sample_name.transcript.sorted.bam.bai' the sorted BAM file and
+RSEM contains a version of samtools in the `sam` subdirectory. RSEM
+will always produce three files:`sample_name.transcript.bam`, the
+unsorted BAM file, `sample_name.transcript.sorted.bam` and
+`sample_name.transcript.sorted.bam.bai` the sorted BAM file and
indices generated by the samtools included. All three files are in
transcript coordinates. When users specify the --output-genome-bam
-option RSEM will produce three files: 'sample_name.genome.bam', the
-unsorted BAM file, 'sample_name.genome.sorted.bam' and
-'sample_name.genome.sorted.bam.bai' the sorted BAM file and indices
+option RSEM will produce three files: `sample_name.genome.bam`, the
+unsorted BAM file, `sample_name.genome.sorted.bam` and
+`sample_name.genome.sorted.bam.bai` the sorted BAM file and indices
generated by the samtools included. All these files are in genomic
coordinates.
#### a) Converting transcript BAM file into genome BAM file
-Normally, RSEM will do this for you via '--output-genome-bam' option
-of 'rsem-calculate-expression'. However, if you have run
-'rsem-prepare-reference' and use 'reference_name.idx.fa' to build
-indices for your aligner, you can use 'rsem-tbam2gbam' to convert your
+Normally, RSEM will do this for you via `--output-genome-bam` option
+of `rsem-calculate-expression`. However, if you have run
+`rsem-prepare-reference` and use `reference_name.idx.fa` to build
+indices for your aligner, you can use `rsem-tbam2gbam` to convert your
transcript coordinate BAM alignments file into a genomic coordinate
BAM alignments file without the need to run the whole RSEM
pipeline.
@@ -179,7 +290,7 @@ Usage:
rsem-tbam2gbam reference_name unsorted_transcript_bam_input genome_bam_output
-reference_name : The name of reference built by 'rsem-prepare-reference'
+reference_name : The name of reference built by `rsem-prepare-reference`
unsorted_transcript_bam_input : This file should satisfy: 1) the alignments of a same read are grouped together, 2) for any paired-end alignment, the two mates should be adjacent to each other, 3) this file should not be sorted by samtools
genome_bam_output : The output genomic coordinate BAM file's name
@@ -188,8 +299,8 @@ genome_bam_output : The output genomic coordinate BAM file's name
A wiggle plot representing the expected number of reads overlapping
each position in the genome/transcript set can be generated from the
sorted genome/transcript BAM file output. To generate the wiggle
-plot, run the 'rsem-bam2wig' program on the
-'sample_name.genome.sorted.bam'/'sample_name.transcript.sorted.bam' file.
+plot, run the `rsem-bam2wig` program on the
+`sample_name.genome.sorted.bam`/`sample_name.transcript.sorted.bam` file.
Usage:
@@ -210,7 +321,7 @@ Here are some guidance for visualizing transcript coordinate files using IGV:
1) Import the transcript sequences as a genome
-Select File -> Import Genome, then fill in ID, Name and Fasta file. Fasta file should be 'reference_name.idx.fa'. After that, click Save button. Suppose ID is filled as 'reference_name', a file called 'reference_name.genome' will be generated. Next time, we can use: File -> Load Genome, then select 'reference_name.genome'.
+Select File -> Import Genome, then fill in ID, Name and Fasta file. Fasta file should be `reference_name.idx.fa`. After that, click Save button. Suppose ID is filled as `reference_name`, a file called `reference_name.genome` will be generated. Next time, we can use: File -> Load Genome, then select `reference_name.genome`.
2) Load visualization files
@@ -221,7 +332,7 @@ Select File -> Load from File, then choose one transcript coordinate visualizati
#### d) Generating Transcript Wiggle Plots
To generate transcript wiggle plots, you should run the
-'rsem-plot-transcript-wiggles' program. Run
+`rsem-plot-transcript-wiggles` program. Run
rsem-plot-transcript-wiggles --help
@@ -230,7 +341,7 @@ documentation page](rsem-plot-transcript-wiggles.html).
#### e) Visualize the model learned by RSEM
-RSEM provides an R script, 'rsem-plot-model', for visulazing the model learned.
+RSEM provides an R script, `rsem-plot-model`, for visulazing the model learned.
Usage:
@@ -259,9 +370,9 @@ Histogram of reads with different number of alignments: x-axis is the number of
## <a name="example"></a> Example
Suppose we download the mouse genome from UCSC Genome Browser. We do
-not add poly(A) tails and use '/ref/mouse_0' as the reference name.
-We have a FASTQ-formatted file, 'mmliver.fq', containing single-end
-reads from one sample, which we call 'mmliver_single_quals'. We want
+not add poly(A) tails and use `/ref/mouse_0` as the reference name.
+We have a FASTQ-formatted file, `mmliver.fq`, containing single-end
+reads from one sample, which we call `mmliver_single_quals`. We want
to estimate expression values by using the single-end model with a
fragment length distribution. We know that the fragment length
distribution is approximated by a normal distribution with a mean of
@@ -270,9 +381,9 @@ credibility intervals in addition to maximum likelihood estimates.
RSEM will be allowed 1G of memory for the credibility interval
calculation. We will visualize the probabilistic read mappings
generated by RSEM on UCSC genome browser. We will generate a list of
-genes' transcript wiggle plots in 'output.pdf'. The list is
-'gene_ids.txt'. We will visualize the models learned in
-'mmliver_single_quals.models.pdf'
+genes` transcript wiggle plots in `output.pdf`. The list is
+`gene_ids.txt`. We will visualize the models learned in
+`mmliver_single_quals.models.pdf`
The commands for this scenario are as follows:
@@ -284,7 +395,7 @@ The commands for this scenario are as follows:
## <a name="simulation"></a> Simulation
-RSEM provides users the 'rsem-simulate-reads' program to simulate RNA-Seq data based on parameters learned from real data sets. Run
+RSEM provides users the `rsem-simulate-reads` program to simulate RNA-Seq data based on parameters learned from real data sets. Run
rsem-simulate-reads
@@ -294,15 +405,15 @@ to get usage information or read the following subsections.
rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]
-__reference_name:__ The name of RSEM references, which should be already generated by 'rsem-prepare-reference'
+__reference_name:__ The name of RSEM references, which should be already generated by `rsem-prepare-reference`
-__estimated_model_file:__ This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name o [...]
+__estimated_model_file:__ This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using `rsem-calculate-expression`. The file can be found under the `sample_name.stat` folder with the name o [...]
-__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is [...]
+__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using `rsem-calculate-expression` from real data. The corresponding file users want to use is `sample_name.isoforms.results`. If simulating from user-designed expression profile is desired, start from a learned `sample_name.isoforms.results` file and only modify the `TPM` column. The simulator only reads the TPM column. But keeping the file format the same is [...]
-__theta0:__ This parameter determines the fraction of reads that are coming from background "noise" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.
+__theta0:__ This parameter determines the fraction of reads that are coming from background "noise" (instead of from a transcript). It can also be estimated using `rsem-calculate-expression` from real data. Users can find it as the first value of the third line of the file `sample_name.stat/sample_name.theta`.
-__N:__ The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.
+__N:__ The total number of reads to be simulated. If `rsem-calculate-expression` is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file `sample_name.stat/sample_name.cnt`.
__output_name:__ Prefix for all output files.
@@ -331,7 +442,7 @@ __rid:__ Simulated read's index, numbered from 0
__dir:__ The direction of the simulated read. 0 refers to forward strand ('+') and 1 refers to reverse strand ('-')
-__sid:__ Represent which transcript this read is simulated from. It ranges between 0 and M, where M is the total number of transcripts. If sid=0, the read is simulated from the background noise. Otherwise, the read is simulated from a transcript with index sid. Transcript sid's transcript name can be found in the 'transcript_id' column of the 'sample_name.isoforms.results' file (at line sid + 1, line 1 is for column names)
+__sid:__ Represent which transcript this read is simulated from. It ranges between 0 and M, where M is the total number of transcripts. If sid=0, the read is simulated from the background noise. Otherwise, the read is simulated from a transcript with index sid. Transcript sid's transcript name can be found in the `transcript_id` column of the `sample_name.isoforms.results` file (at line sid + 1, line 1 is for column names)
__pos:__ The start position of the simulated read in strand dir of transcript sid. It is numbered from 0
@@ -339,7 +450,7 @@ __insertL:__ Only appear for paired-end reads. It gives the insert length of the
### Example:
-Suppose we want to simulate 50 millon single-end reads with quality scores and use the parameters learned from [Example](#example). In addition, we set theta0 as 0.2 and output_name as 'simulated_reads'. The command is:
+Suppose we want to simulate 50 millon single-end reads with quality scores and use the parameters learned from [Example](#example). In addition, we set theta0 as 0.2 and output_name as `simulated_reads`. The command is:
rsem-simulate-reads /ref/mouse_0 mmliver_single_quals.stat/mmliver_single_quals.model mmliver_single_quals.isoforms.results 0.2 50000000 simulated_reads
@@ -370,7 +481,7 @@ it is more robust to outliers. For more information about EBSeq
website](http://www.biostat.wisc.edu/~ningleng/EBSeq_Package).
-RSEM includes EBSeq in its folder named 'EBSeq'. To use it, first type
+RSEM includes EBSeq in its folder named `EBSeq`. To use it, first type
make ebseq
@@ -379,7 +490,7 @@ to compile the EBSeq related codes.
EBSeq requires gene-isoform relationship for its isoform DE
detection. However, for de novo assembled transcriptome, it is hard to
obtain an accurate gene-isoform relationship. Instead, RSEM provides a
-script 'rsem-generate-ngvector', which clusters transcripts based on
+script `rsem-generate-ngvector`, which clusters transcripts based on
measures directly relating to read mappaing ambiguity. First, it
calcualtes the 'unmappability' of each transcript. The 'unmappability'
of a transcript is the ratio between the number of k mers with at
@@ -398,17 +509,17 @@ documentation
page](rsem-generate-ngvector.html).
If your reference is a de novo assembled transcript set, you should
-run 'rsem-generate-ngvector' first. Then load the resulting
-'output_name.ngvec' into R. For example, you can use
+run `rsem-generate-ngvector` first. Then load the resulting
+`output_name.ngvec` into R. For example, you can use
NgVec <- scan(file="output_name.ngvec", what=0, sep="\n")
. After that, set "NgVector = NgVec" for your differential expression
-test (either 'EBTest' or 'EBMultiTest').
+test (either `EBTest` or `EBMultiTest`).
For users' convenience, RSEM also provides a script
-'rsem-generate-data-matrix' to extract input matrix from expression
+`rsem-generate-data-matrix` to extract input matrix from expression
results:
rsem-generate-data-matrix sampleA.[genes/isoforms].results sampleB.[genes/isoforms].results ... > output_name.counts.matrix
@@ -418,18 +529,18 @@ all isoform level results. You can load the matrix into R by
IsoMat <- data.matrix(read.table(file="output_name.counts.matrix"))
-before running either 'EBTest' or 'EBMultiTest'.
+before running either `EBTest` or `EBMultiTest`.
-Lastly, RSEM provides two scripts, 'rsem-run-ebseq' and
-'rsem-control-fdr', to help users find differential expressed
-genes/transcripts. First, 'rsem-run-ebseq' calls EBSeq to calculate related statistics
+Lastly, RSEM provides two scripts, `rsem-run-ebseq` and
+`rsem-control-fdr`, to help users find differential expressed
+genes/transcripts. First, `rsem-run-ebseq` calls EBSeq to calculate related statistics
for all genes/transcripts. Run
rsem-run-ebseq --help
to get usage information or visit the [rsem-run-ebseq documentation
page](rsem-run-ebseq.html). Second,
-'rsem-control-fdr' takes 'rsem-run-ebseq' 's result and reports called
+`rsem-control-fdr` takes `rsem-run-ebseq` 's result and reports called
differentially expressed genes/transcripts by controlling the false
discovery rate. Run
@@ -440,7 +551,7 @@ page](rsem-control-fdr.html). These
two scripts can perform DE analysis on either 2 conditions or multiple
conditions.
-Please note that 'rsem-run-ebseq' and 'rsem-control-fdr' use EBSeq's
+Please note that `rsem-run-ebseq` and `rsem-control-fdr` use EBSeq's
default parameters. For advanced use of EBSeq or information about how
EBSeq works, please refer to [EBSeq's
manual](http://www.bioconductor.org/packages/devel/bioc/vignettes/EBSeq/inst/doc/EBSeq_Vignette.pdf).
diff --git a/Transcript.h b/Transcript.h
index f7afa2b..5b80854 100644
--- a/Transcript.h
+++ b/Transcript.h
@@ -7,6 +7,7 @@
#include<string>
#include<vector>
#include<fstream>
+#include<sstream>
#include "utils.h"
@@ -30,17 +31,14 @@ public:
structure.clear();
strand = 0;
seqname = gene_id = transcript_id = "";
+ gene_name = transcript_name = "";
left = "";
}
Transcript(const std::string& transcript_id, const std::string& gene_id, const std::string& seqname,
- const char& strand, const std::vector<Interval>& structure, const std::string& left) {
- this->structure = structure;
- this->strand = strand;
- this->seqname = seqname;
- this->gene_id = gene_id;
- this->transcript_id = transcript_id;
-
+ const char& strand, const std::vector<Interval>& structure, const std::string& left,
+ const std::string& transcript_name = "", const std::string& gene_name = "") : structure(structure), strand(strand),
+ seqname(seqname), gene_id(gene_id), transcript_id(transcript_id), gene_name(gene_name), transcript_name(transcript_name) {
//eliminate prefix spaces in string variable "left"
int pos = 0;
int len = left.length();
@@ -58,8 +56,12 @@ public:
const std::string& getTranscriptID() const { return transcript_id; }
+ const std::string& getTranscriptName() const { return transcript_name; }
+
const std::string& getGeneID() const { return gene_id; }
+ const std::string& getGeneName() const { return gene_name; }
+
const std::string& getSeqName() const { return seqname; }
char getStrand() const { return strand; }
@@ -80,6 +82,7 @@ private:
std::vector<Interval> structure; // transcript structure , coordinate starts from 1
char strand;
std::string seqname, gene_id, transcript_id; // follow GTF definition
+ std::string gene_name, transcript_name;
std::string left;
};
@@ -116,10 +119,18 @@ void Transcript::extractSeq(const std::string& gseq, std::string& seq) const {
void Transcript::read(std::ifstream& fin) {
int s;
std::string tmp;
-
- getline(fin, transcript_id);
- getline(fin, gene_id);
+ std::istringstream strin;
+
+ getline(fin, tmp);
+ strin.str(tmp);
+ strin>> transcript_id>> transcript_name;
+
+ getline(fin, tmp);
+ strin.clear(); strin.str(tmp);
+ strin>> gene_id>> gene_name;
+
getline(fin, seqname);
+
fin>>tmp>>length;
assert(tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-'));
strand = tmp[0];
@@ -137,8 +148,14 @@ void Transcript::read(std::ifstream& fin) {
void Transcript::write(std::ofstream& fout) {
int s = structure.size();
- fout<<transcript_id<<std::endl;
- fout<<gene_id<<std::endl;
+ fout<< transcript_id;
+ if (transcript_name != "") fout<< '\t'<< transcript_name;
+ fout<< std::endl;
+
+ fout<< gene_id;
+ if (gene_name != "") fout<< '\t'<< gene_name;
+ fout<< std::endl;
+
fout<<seqname<<std::endl;
fout<<strand<<" "<<length<<std::endl;
fout<<s;
diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW
index 8f0ef31..46e652c 100644
--- a/WHAT_IS_NEW
+++ b/WHAT_IS_NEW
@@ -1,3 +1,29 @@
+RSEM v1.2.26
+
+- RSEM supports GFF3 annotation format now
+- Added instructions to build RSEM references using RefSeq, Ensembl, or GENCODE annotations
+- Added an option to extract only transcripts from certain resources given a GTF/GFF3 file
+- Fixed a bug and improved the clarity of error messages for extracting transcripts from genome
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.25
+
+- RSEM will extract gene_name/transcript_name from GTF file when possible; however, it only appends them to the 'sample_name.*.results' files if '--append-names' option is specified; unlike v1.2.24, this version is compatible with STAR aligner even when '--append-names' is set
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.24
+
+- RSEM will extract gene_name/transcript_name from GTF file when possible; if extracted, gene_name/transcript_name will append at the end of gene_id/transcript_id with an underscore in between
+- Modified 'rsem-plot-model' to indicate the modes of fragment length and read length distributions
+- Modified 'rsem-plot-model' to present alignment statistics better using both barplot and pie chart
+- Updated 'EBSeq' to version 1.2.0
+- Added coefficient of quartile variation in addition to credibility intervals when '--calc-ci' is turned on
+- Added '--single-cell-prior' option to notify RSEM to use a sparse prior (Dir(0.1)) for single cell data; this option only makes sense if '--calc-pme' or '--calc-ci' is set
+
+--------------------------------------------------------------------------------------------
+
RSEM v1.2.23
- Moved version information from WHAT_IS_NEW to rsem_perl_utils.pm in order to make sure the '--version' option always output the version information
diff --git a/WriteResults.h b/WriteResults.h
index 79ba5b2..889d4c2 100644
--- a/WriteResults.h
+++ b/WriteResults.h
@@ -120,7 +120,7 @@ inline bool isAlleleSpecific(const char* refName, GroupInfo* gt = NULL, GroupInf
return alleleS;
}
-void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts& transcripts, std::vector<double>& theta, std::vector<double>& eel, double* counts) {
+void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts& transcripts, std::vector<double>& theta, std::vector<double>& eel, double* counts, bool appendNames) {
char outF[STRLEN];
FILE *fo;
@@ -131,7 +131,7 @@ void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts
std::vector<int> tlens;
std::vector<double> fpkm, tpm, isopct;
std::vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
-
+
// Load group info
sprintf(groupF, "%s.grp", refName);
gi.load(groupF);
@@ -224,11 +224,19 @@ void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts
fo = fopen(outF, "w");
for (int i = 1; i <= M; i++) {
const Transcript& transcript = transcripts.getTranscriptAt(i);
- fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
+
+ fprintf(fo, "%s", transcript.getTranscriptID().c_str());
+ if (appendNames && transcript.getTranscriptName() != "")
+ fprintf(fo, "_%s", transcript.getTranscriptName().c_str());
+ fprintf(fo, "%c", (i < M ? '\t' : '\n'));
}
for (int i = 1; i <= M; i++) {
const Transcript& transcript = transcripts.getTranscriptAt(i);
- fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
+
+ fprintf(fo, "%s", transcript.getGeneID().c_str());
+ if (appendNames && transcript.getGeneName() != "")
+ fprintf(fo, "_%s", transcript.getGeneName().c_str());
+ fprintf(fo, "%c", (i < M ? '\t' : '\n'));
}
for (int i = 1; i <= M; i++)
fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
@@ -307,16 +315,23 @@ void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts
fo = fopen(outF, "w");
for (int i = 0; i < m; i++) {
const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i));
- fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m - 1 ? '\t' : '\n'));
+
+ fprintf(fo, "%s", transcript.getGeneID().c_str());
+ if (appendNames && transcript.getGeneName() != "")
+ fprintf(fo, "_%s", transcript.getGeneName().c_str());
+ fprintf(fo, "%c", (i < m - 1 ? '\t' : '\n'));
}
for (int i = 0; i < m; i++) {
int b = gi.spAt(i), e = gi.spAt(i + 1);
std::string curtid = "", tid;
for (int j = b; j < e; j++) {
- tid = transcripts.getTranscriptAt(j).getTranscriptID();
+ const Transcript& transcript = transcripts.getTranscriptAt(j);
+ tid = transcript.getTranscriptID();
if (curtid != tid) {
if (curtid != "") fprintf(fo, ",");
fprintf(fo, "%s", tid.c_str());
+ if (appendNames && transcript.getTranscriptName() != "")
+ fprintf(fo, "_%s", transcript.getTranscriptName().c_str());
curtid = tid;
}
}
diff --git a/calcCI.cpp b/calcCI.cpp
index 7cb58ff..088f2eb 100644
--- a/calcCI.cpp
+++ b/calcCI.cpp
@@ -39,13 +39,16 @@ struct CIParams {
};
struct CIType {
- float lb, ub; // the interval is [lb, ub]
-
- CIType() { lb = ub = 0.0; }
+ float lb, ub; // the interval is [lb, ub]
+ float cqv; // coefficient of quartile variation
+
+ CIType() { lb = ub = cqv = 0.0; }
};
int model_type;
+double pseudoC; // pseudo count, default is 1
+
int nMB;
double confidence;
int nCV, nSpC, nSamples; // nCV: number of count vectors; nSpC: number of theta vectors sampled per count vector; nSamples: nCV * nSpC
@@ -106,14 +109,14 @@ void* sample_theta_from_c(void* arg) {
int cnt = 0;
while (fscanf(fi, "%d", &cvec[0]) == 1) {
for (int j = 1; j <= M; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1);
- assert(cvec[0] > 0);
+ assert(cvec[0] >= 0);
++cnt;
for (int j = 0; j <= M; j++) {
gammas[j] = NULL; rgs[j] = NULL;
- if (cvec[j] > 0) {
- gammas[j] = new gamma_dist(cvec[j]);
+ if (cvec[j] >= 0) {
+ gammas[j] = new gamma_dist(cvec[j] + pseudoC);
rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
}
}
@@ -121,7 +124,7 @@ void* sample_theta_from_c(void* arg) {
for (int i = 0; i < nSpC; i++) {
double sum = 0.0;
for (int j = 0; j <= M; j++) {
- theta[j] = ((j == 0 || (cvec[j] > 0 && eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
+ theta[j] = ((j == 0 || (cvec[j] >= 0 && eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
sum += theta[j];
}
assert(sum >= EPSILON);
@@ -210,14 +213,16 @@ void sample_theta_vectors_from_count_vectors() {
if (verbose) { printf("Sampling is finished!\n"); }
}
-void calcCI(int nSamples, float *samples, float &lb, float &ub) {
+void calcCI(int nSamples, float *samples, CIType& ci) {
int p, q; // p pointer for lb, q pointer for ub;
int newp, newq;
int threshold = nSamples - (int(confidence * nSamples - 1e-8) + 1);
int nOutside = 0;
+ // sort values
sort(samples, samples + nSamples);
+ // calculate credibility interval
p = 0; q = nSamples - 1;
newq = nSamples - 1;
do {
@@ -228,11 +233,11 @@ void calcCI(int nSamples, float *samples, float &lb, float &ub) {
nOutside = nSamples - (q + 1);
- lb = -1e30; ub = 1e30;
+ ci.lb = -1e30; ci.ub = 1e30;
do {
- if (samples[q] - samples[p] < ub - lb) {
- lb = samples[p];
- ub = samples[q];
+ if (samples[q] - samples[p] < ci.ub - ci.lb) {
+ ci.lb = samples[p];
+ ci.ub = samples[q];
}
newp = p;
@@ -251,6 +256,29 @@ void calcCI(int nSamples, float *samples, float &lb, float &ub) {
}
else p = newp;
} while (p <= threshold);
+
+
+ // calculate coefficient of quartile variation
+ float Q1, Q3; // the first and third quartiles
+
+ // calculate Tukey's hinges
+ int quotient = nSamples / 4;
+ int residue = nSamples % 4;
+
+ if (residue == 0) {
+ Q1 = (samples[quotient - 1] + samples[quotient]) / 2.0;
+ Q3 = (samples[3 * quotient - 1] + samples[3 * quotient]) / 2.0;
+ }
+ else if (residue == 3) {
+ Q1 = (samples[quotient] + samples[quotient + 1]) / 2.0;
+ Q3 = (samples[quotient * 3 + 1] + samples[quotient * 3 + 2]) / 2.0;
+ }
+ else {
+ Q1 = samples[quotient];
+ Q3 = samples[3 * quotient];
+ }
+
+ ci.cqv = (Q3 - Q1 > 0.0 ? (Q3 - Q1) / (Q3 + Q1) : 0.0);
}
void* calcCI_batch(void* arg) {
@@ -290,8 +318,8 @@ void* calcCI_batch(void* arg) {
if (curtid != tid) {
if (curtid >= 0) {
if (j - curaid > 1) {
- calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub);
- calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub);
+ calcCI(nSamples, itsamples, iso_tpm[curtid]);
+ calcCI(nSamples, ifsamples, iso_fpkm[curtid]);
}
else {
iso_tpm[curtid] = tpm[curaid];
@@ -313,13 +341,13 @@ void* calcCI_batch(void* arg) {
gtsamples[k] += tsamples[k];
gfsamples[k] += fsamples[k];
}
- calcCI(nSamples, tsamples, tpm[j].lb, tpm[j].ub);
- calcCI(nSamples, fsamples, fpkm[j].lb, fpkm[j].ub);
+ calcCI(nSamples, tsamples, tpm[j]);
+ calcCI(nSamples, fsamples, fpkm[j]);
}
if (e - b > 1) {
- calcCI(nSamples, gtsamples, gene_tpm[i].lb, gene_tpm[i].ub);
- calcCI(nSamples, gfsamples, gene_fpkm[i].lb, gene_fpkm[i].ub);
+ calcCI(nSamples, gtsamples, gene_tpm[i]);
+ calcCI(nSamples, gfsamples, gene_fpkm[i]);
}
else {
gene_tpm[i] = tpm[b];
@@ -333,8 +361,8 @@ void* calcCI_batch(void* arg) {
if (alleleS && (curtid >= 0)) {
if (gi.spAt(ciParams->end_gene_id) - curaid > 1) {
- calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub);
- calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub);
+ calcCI(nSamples, itsamples, iso_tpm[curtid]);
+ calcCI(nSamples, ifsamples, iso_fpkm[curtid]);
}
else {
iso_tpm[curtid] = tpm[curaid];
@@ -420,9 +448,13 @@ void calculate_credibility_intervals(char* imdName) {
for (int i = 1; i <= M; i++)
fprintf(fo, "%.6g%c", tpm[i].ub, (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.6g%c", tpm[i].cqv, (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
fprintf(fo, "%.6g%c", fpkm[i].lb, (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
fprintf(fo, "%.6g%c", fpkm[i].ub, (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.6g%c", fpkm[i].cqv, (i < M ? '\t' : '\n'));
fclose(fo);
if (alleleS) {
@@ -434,9 +466,13 @@ void calculate_credibility_intervals(char* imdName) {
for (int i = 0; i < m_trans; i++)
fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < m_trans - 1 ? '\t' : '\n'));
for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.6g%c", iso_tpm[i].cqv, (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < m_trans - 1 ? '\t' : '\n'));
for (int i = 0; i < m_trans; i++)
fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.6g%c", iso_fpkm[i].cqv, (i < m_trans - 1 ? '\t' : '\n'));
fclose(fo);
}
@@ -444,13 +480,17 @@ void calculate_credibility_intervals(char* imdName) {
sprintf(outF, "%s.gene_res", imdName);
fo = fopen(outF, "a");
for (int i = 0; i < m; i++)
- fprintf(fo, "%.6g%c", gene_tpm[i].lb, (i < m - 1 ? '\t' : '\n'));
+ fprintf(fo, "%.6g%c", gene_tpm[i].lb, (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.6g%c", gene_tpm[i].ub, (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.6g%c", gene_tpm[i].cqv, (i < m - 1 ? '\t' : '\n'));
for (int i = 0; i < m; i++)
- fprintf(fo, "%.6g%c", gene_tpm[i].ub, (i < m - 1 ? '\t' : '\n'));
+ fprintf(fo, "%.6g%c", gene_fpkm[i].lb, (i < m - 1 ? '\t' : '\n'));
for (int i = 0; i < m; i++)
- fprintf(fo, "%.6g%c", gene_fpkm[i].lb, (i < m - 1 ? '\t' : '\n'));
+ fprintf(fo, "%.6g%c", gene_fpkm[i].ub, (i < m - 1 ? '\t' : '\n'));
for (int i = 0; i < m; i++)
- fprintf(fo, "%.6g%c", gene_fpkm[i].ub, (i < m - 1 ? '\t' : '\n'));
+ fprintf(fo, "%.6g%c", gene_fpkm[i].cqv, (i < m - 1 ? '\t' : '\n'));
fclose(fo);
delete[] tpm;
@@ -467,7 +507,7 @@ void calculate_credibility_intervals(char* imdName) {
int main(int argc, char* argv[]) {
if (argc < 8) {
- printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [--seed seed] [-q]\n");
+ printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [--seed seed] [--pseudo-count pseudo_count] [-q]\n");
exit(-1);
}
@@ -483,6 +523,7 @@ int main(int argc, char* argv[]) {
nThreads = 1;
quiet = false;
hasSeed = false;
+ pseudoC = 1.0;
for (int i = 8; i < argc; i++) {
if (!strcmp(argv[i], "-p")) nThreads = atoi(argv[i + 1]);
if (!strcmp(argv[i], "--seed")) {
@@ -491,6 +532,7 @@ int main(int argc, char* argv[]) {
seed = 0;
for (int k = 0; k < len; k++) seed = seed * 10 + (argv[i + 1][k] - '0');
}
+ if (!strcmp(argv[i], "--pseudo-count")) pseudoC = atof(argv[i + 1]);
if (!strcmp(argv[i], "-q")) quiet = true;
}
verbose = !quiet;
diff --git a/extractRef.cpp b/extractRef.cpp
index 68854f1..f866b5a 100644
--- a/extractRef.cpp
+++ b/extractRef.cpp
@@ -4,9 +4,11 @@
#include<cstdlib>
#include<fstream>
#include<sstream>
+#include<set>
#include<map>
#include<vector>
#include<algorithm>
+#include<string>
#include "utils.h"
#include "my_assert.h"
@@ -50,6 +52,20 @@ char mappingFile[STRLEN];
map<string, string> mi_table; // mapping info table
map<string, string>::iterator mi_iter; //mapping info table's iterator
+set<string> sources;
+
+void parseSources(char* sstr) {
+ char* p = strtok(sstr, ",");
+ while (p != NULL) {
+ sources.insert(p);
+ p = strtok(NULL, ",");
+ }
+}
+
+inline bool isTrusted(const string& source) {
+ return sources.size() == 0 || sources.find(source) != sources.end();
+}
+
void loadMappingInfo(char* mappingF) {
ifstream fin(mappingF);
string line, key, value;
@@ -74,6 +90,8 @@ bool buildTranscript(int sp, int ep) {
string transcript_id = items[sp].getTranscriptID();
string gene_id = items[sp].getGeneID();
+ string gene_name = "", transcript_name = "";
+
char strand = items[sp].getStrand();
string seqname = items[sp].getSeqName();
string left = items[sp].getLeft();
@@ -84,8 +102,17 @@ bool buildTranscript(int sp, int ep) {
int start = items[i].getStart();
int end = items[i].getEnd();
- general_assert(strand == items[i].getStrand(), "According to the GTF file given, a transcript has exons from different orientations!");
- general_assert(seqname == items[i].getSeqName(), "According to the GTF file given, a transcript has exons on multiple chromosomes!");
+ general_assert(strand == items[i].getStrand(), "According to the GTF file given, transcript " + transcript_id + " has exons from different orientations!");
+ general_assert(seqname == items[i].getSeqName(), "According to the GTF file given, transcript " + transcript_id + " has exons on multiple chromosomes!");
+
+ if (items[i].getGeneName() != "") {
+ if (gene_name == "") gene_name = items[i].getGeneName();
+ else general_assert(gene_name == items[i].getGeneName(), "Transcript " + transcript_id + " is associated with multiple gene names!");
+ }
+ if (items[i].getTranscriptName() != "") {
+ if (transcript_name == "") transcript_name = items[i].getTranscriptName();
+ else general_assert(transcript_name == items[i].getTranscriptName(), "Transcript " + transcript_id + " is associated with multiple transcript names!");
+ }
if (cur_e + 1 < start) {
if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e));
@@ -95,7 +122,7 @@ bool buildTranscript(int sp, int ep) {
}
if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e));
- transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left));
+ transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left, transcript_name, gene_name));
return true;
}
@@ -113,8 +140,7 @@ void parse_gtf_file(char* gtfF) {
while (getline(fin, line)) {
if (line[0] == '#') continue; // if this line is comment, jump it
item.parse(line);
- string feature = item.getFeature();
- if (feature == "exon") {
+ if (item.getFeature() == "exon" && isTrusted(item.getSource())) {
if (item.getStart() > item.getEnd()) {
printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n");
printf("\t%s\n\n", line.c_str());
@@ -140,7 +166,7 @@ void parse_gtf_file(char* gtfF) {
if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); }
}
fin.close();
-
+
sort(items.begin(), items.end());
int sp = 0, ep; // start pointer, end pointer
@@ -256,17 +282,21 @@ void writeResults(char* refName) {
}
int main(int argc, char* argv[]) {
- if (argc < 6 || ((hasMappingFile = atoi(argv[4])) && argc < 7)) {
- printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n");
+ if (argc < 7 || ((hasMappingFile = atoi(argv[5])) && argc < 8)) {
+ printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF sources hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n");
exit(-1);
}
verbose = !atoi(argv[2]);
if (hasMappingFile) {
- loadMappingInfo(argv[5]);
+ loadMappingInfo(argv[6]);
}
- parse_gtf_file(argv[3]);
+ sources.clear();
+ if (strcmp(argv[4], "None")) parseSources(argv[4]);
+
+ parse_gtf_file(argv[3]);
+
ifstream fin;
string line, gseq, seqname;
@@ -274,7 +304,7 @@ int main(int argc, char* argv[]) {
seqs.clear();
seqs.resize(M + 1, "");
- int start = hasMappingFile ? 6 : 5;
+ int start = hasMappingFile ? 7 : 6;
for (int i = start; i < argc; i++) {
fin.open(argv[i]);
general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index c4dc32f..4e1e21c 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -57,6 +57,7 @@ my $genGenomeBamF = 0;
my $sampling = 0;
my $calcPME = 0;
my $calcCI = 0;
+my $single_cell_prior = 0;
my $quiet = 0;
my $help = 0;
@@ -74,6 +75,8 @@ my $bowtie2_sensitivity_level = "sensitive"; # must be one of "very_fast", "fast
my $seed = "NULL";
+my $appendNames = 0;
+
my $version = 0;
my $mTime = 0;
@@ -127,9 +130,11 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
"estimate-rspd" => \$estRSPD,
"num-rspd-bins=i" => \$B,
"p|num-threads=i" => \$nThreads,
+ "append-names" => \$appendNames,
"no-bam-output" => sub { $genBamF = 0; },
"output-genome-bam" => \$genGenomeBamF,
"sampling-for-bam" => \$sampling,
+ "single-cell-prior" => \$single_cell_prior,
"calc-pme" => \$calcPME,
"gibbs-burnin=i" => \$BURNIN,
"gibbs-number-of-samples=i" => \$NCV,
@@ -464,6 +469,7 @@ if ($genBamF) {
if ($seed ne "NULL") { $command .= " --seed $seeds[0]"; }
}
if ($calcPME || $calcCI) { $command .= " --gibbs-out"; }
+if ($appendNames) { $command .= " --append-names"; }
if ($quiet) { $command .= " -q"; }
&runCommand($command);
@@ -502,6 +508,7 @@ if ($calcPME || $calcCI ) {
$command = "rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
$command .= " -p $nThreads";
if ($seed ne "NULL") { $command .= " --seed $seeds[1]"; }
+ if ($single_cell_prior) { $command .= " --pseudo-count 0.1"; }
if ($quiet) { $command .= " -q"; }
&runCommand($command);
}
@@ -527,6 +534,7 @@ if ($calcCI) {
$command = "rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
$command .= " -p $nThreads";
if ($seed ne "NULL") { $command .= " --seed $seeds[2]"; }
+ if ($single_cell_prior) { $command .= " --pseudo-count 0.1"; }
if ($quiet) { $command .= " -q"; }
&runCommand($command);
@@ -644,6 +652,10 @@ Input file is in BAM format. (Default: off)
Number of threads to use. Both Bowtie/Bowtie2, expression estimation and 'samtools sort' will use this many threads. (Default: 1)
+=item B<--append-names>
+
+If gene_name/transcript_name is available, append it to the end of gene_id/transcript_id (separated by '_') in files 'sample_name.isoforms.results' and 'sample_name.genes.results'. (Default: off)
+
=item B<--no-bam-output>
Do not output any BAM file. (Default: off)
@@ -660,6 +672,10 @@ When RSEM generates a BAM file, instead of outputing all alignments a read has w
Set the seed for the random number generators used in calculating posterior mean estimates and credibility intervals. The seed must be a non-negative 32 bit interger. (Default: off)
+=item B<--single-cell-prior>
+
+By default, RSEM uses Dirichlet(1) as the prior to calculate posterior mean estimates and credibility intervals. However, much less genes are expressed in single cell RNA-Seq data. Thus, if you want to compute posterior mean estimates and/or credibility intervals and you have single-cell RNA-Seq data, you are recommended to turn on this option. Then RSEM will use Dirichlet(0.1) as the prior which encourage the sparsity of the expression levels. (Default: off)
+
=item B<--calc-pme>
Run RSEM's collapsed Gibbs sampler to calculate posterior mean estimates. (Default: off)
@@ -812,7 +828,7 @@ The credibility level for credibility intervals. (Default: 0.95)
=item B<--ci-memory> <int>
-Maximum size (in memory, MB) of the auxiliary buffer used for computing credibility intervals (CI). Set it larger for a faster CI calculation. However, leaving 2 GB memory free for other usage is recommended. (Default: 1024)
+Maximum size (in memory, MB) of the auxiliary buffer used for computing credibility intervals (CI). (Default: 1024)
=item B<--ci-number-of-samples-per-count-vector> <int>
@@ -870,7 +886,7 @@ File containing isoform level expression estimates. The first line
contains column names separated by the tab character. The format of
each line in the rest of this file is:
-transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
+transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
Fields are separated by the tab character. Fields within "[]" are
optional. They will not be presented if neither '--calc-pme' nor
@@ -925,7 +941,17 @@ percentage calculated from 'pme_TPM' values.
'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and
'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
credibility intervals for TPM and FPKM values. The bounds are
-inclusive (i.e. [l, u]).
+inclusive (i.e. [l, u]).
+
+'TPM_coefficient_of_quartile_variation' and
+'FPKM_coefficient_of_quartile_variation' are coefficients of quartile
+variation (CQV) for TPM and FPKM values. CQV is a robust way of
+measuring the ratio between the standard deviation and the mean. It is
+defined as
+
+CQV := (Q3 - Q1) / (Q3 + Q1),
+
+where Q1 and Q3 are the first and third quartiles.
=item B<sample_name.genes.results>
@@ -933,7 +959,7 @@ File containing gene level expression estimates. The first line
contains column names separated by the tab character. The format of
each line in the rest of this file is:
-gene_id transcript_id(s) length effective_length expected_count TPM FPKM [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
+gene_id transcript_id(s) length effective_length expected_count TPM FPKM [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
Fields are separated by the tab character. Fields within "[]" are
optional. They will not be presented if neither '--calc-pme' nor
@@ -958,7 +984,7 @@ allele-specific expression calculation. The first line
contains column names separated by the tab character. The format of
each line in the rest of this file is:
-allele_id transcript_id gene_id length effective_length expected_count TPM FPKM AlleleIsoPct AlleleGenePct [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
+allele_id transcript_id gene_id length effective_length expected_count TPM FPKM AlleleIsoPct AlleleGenePct [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
Fields are separated by the tab character. Fields within "[]" are
optional. They will not be presented if neither '--calc-pme' nor
diff --git a/rsem-gen-transcript-plots b/rsem-gen-transcript-plots
index 5b8cc98..91df2f7 100755
--- a/rsem-gen-transcript-plots
+++ b/rsem-gen-transcript-plots
@@ -43,15 +43,22 @@ build_map <- function(filename) {
map
}
+canonical_id = function(id) {
+ pos = regexpr("_", id, fixed = T)
+ ifelse(pos > 0, substr(id, 1, pos - 1), id)
+}
+
make_a_plot <- function(id) {
- vec <- readdepth[[id]]
+ cid = canonical_id(id)
+
+ vec <- readdepth[[cid]]
if (is.null(vec)) exit_with_error(sprintf("Unknown %s detected, %s is not included in RSEM's indices.", ifelse(alleleS, "allele-specific transcript", "transcript"), id))
if (is.na(vec[[2]])) wiggle <- rep(0, vec[[1]]) else wiggle <- as.numeric(unlist(strsplit(vec[[2]], split = " ")))
len <- length(wiggle)
if (!show_uniq) {
plot(wiggle, type = "h")
} else {
- vec <- readdepth_uniq[[id]]
+ vec <- readdepth_uniq[[cid]]
stopifnot(!is.null(vec))
if (is.na(vec[[2]])) wiggle_uniq <- rep(0, vec[[1]]) else wiggle_uniq <- as.numeric(unlist(strsplit(vec[[2]], split = " ")))
stopifnot(len == length(wiggle_uniq))
diff --git a/rsem-gff3-to-gtf b/rsem-gff3-to-gtf
new file mode 100755
index 0000000..c43e9e2
--- /dev/null
+++ b/rsem-gff3-to-gtf
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import argparse
+import re
+
+class HelpOnErrorParser(argparse.ArgumentParser):
+ def error(self, msg):
+ sys.stderr.write("{}: error: {}\n\n".format(os.path.basename(sys.argv[0]), msg))
+ self.print_help()
+ sys.exit(-1)
+
+def my_assert(bool, msg):
+ if not bool:
+ sys.stderr.write(msg + "\n")
+ sys.exit(-1)
+
+def findTag(tag, attributes):
+ pos = attributes.find(tag + "=")
+ if pos < 0:
+ return None
+ pos += len(tag) + 1
+ rpos = attributes.find(";", pos)
+ if rpos < 0:
+ rpos = len(attributes)
+ return attributes[pos:rpos]
+
+parser = HelpOnErrorParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, description = "Convert GFF3 files to GTF files.")
+parser.add_argument("input_GFF3_file", help = "Input GFF3 file.")
+parser.add_argument("output_GTF_file", help = "Output GTF file.")
+parser.add_argument("--RNA-patterns", help = "Types of RNAs to be extracted.", default = "mRNA")
+
+args = parser.parse_args()
+
+trans2gene = {} # tid -> gid
+gid2name = {} # gid -> gene name
+tid2name = {} # tid -> transcript name
+
+rgx = re.compile("[\t]+")
+rgx2 = re.compile("^(" + "|".join(args.RNA_patterns.split(",")) + ")$")
+
+fin = open(args.input_GFF3_file, "r")
+fout = open(args.output_GTF_file, "w")
+
+line_no = 0
+
+for line in fin:
+ line = line.rstrip("\r\n") # remove return characters
+ line_no += 1
+
+ if line.startswith("##FASTA"):
+ break
+ if line.startswith("#"):
+ if line.startswith("###"):
+ # clear all dictionaries
+ trans2gene = {}
+ gid2name = {}
+ tid2name = {}
+ continue
+
+ arr = rgx.split(line)
+ my_assert(len(arr) == 9, "Line {} does not have 9 fields:\n{}".format(line_no, line))
+
+ if arr[2] == "exon":
+ parent = findTag("Parent", arr[8])
+ my_assert(parent != None, "Line {} does not have a Parent tag:\n{}".format(line_no, line))
+
+ tids = parent.split(",")
+ for tid in tids:
+ if tid not in trans2gene:
+ continue
+ gid = trans2gene[tid]
+ fout.write("{}\tgene_id \"{}\"; transcript_id \"{}\";".format("\t".join(arr[:-1]), gid, tid))
+ name = gid2name.get(gid, None)
+ if name != None:
+ fout.write(" gene_name \"{}\";".format(name))
+ name = tid2name.get(tid, None)
+ if name != None:
+ fout.write(" transcript_name \"{}\";".format(name))
+ fout.write("\n")
+
+ elif arr[2] == "gene":
+ gid = findTag("ID", arr[8])
+ name = findTag("Name", arr[8])
+
+ my_assert(gid != None, "Line {} does not have a ID tag:\n{}".format(line_no, line))
+
+ if name != None:
+ gid2name[gid] = name
+
+ elif rgx2.search(arr[2]) != None:
+ tid = findTag("ID", arr[8])
+ gid = findTag("Parent", arr[8])
+ name = findTag("Name", arr[8])
+
+ my_assert(tid != None, "Line {} does not have a ID tag:\n{}".format(line_no, line))
+ my_assert(gid != None, "Line {} does not have a Parent tag:\n{}".format(line_no, line))
+
+ trans2gene[tid] = gid
+ if name != None:
+ tid2name[tid] = name
+
+fin.close()
+fout.close()
diff --git a/rsem-plot-model b/rsem-plot-model
index e87226d..3908e22 100755
--- a/rsem-plot-model
+++ b/rsem-plot-model
@@ -12,7 +12,6 @@ token <- strvec[length(strvec)]
stat.dir <- paste(argv[1], ".stat", sep = "")
if (!file.exists(stat.dir)) {
cat("Error: directory does not exist: ", stat.dir, "\n", sep = "")
- cat(strwrap("This version of rsem-plot-model only works with the output of RSEM versions >= 1.1.8"), sep="\n")
q(status = 1)
}
modelF <- paste(stat.dir, "/", token, ".model", sep = "")
@@ -31,13 +30,15 @@ vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]])
maxL <- vec[2] # maxL used for Profile
x <- (vec[1] + 1) : vec[2]
y <- as.numeric(strsplit(strvec[2], split = " ")[[1]])
+mode_len = which(y == max(y)) + vec[1]
mean <- weighted.mean(x, y)
std <- sqrt(weighted.mean((x - mean)^2, y))
plot(x, y, type = "h",
main = "Fragment Length Distribution",
- sub = paste("Mean = ", round(mean, 1), ", Std = ", round(std, 1)),
+ sub = sprintf("Mode = %d, Mean = %.1f, and Std = %.1f", mode_len, mean, std),
xlab = "Fragment Length",
ylab = "Probability")
+abline(v = mode_len, col = "red", lty = "dashed")
# mate length distribution
if (model_type == 0 || model_type == 1) bval <- as.numeric(readLines(con, n = 1)[1]) else bval <- 1
@@ -48,11 +49,12 @@ if (bval == 1) {
maxL <- vec[2]
x <- (vec[1] + 1) : vec[2]
y <- as.numeric(list[[2]])
+ mode_len = which(y == max(y)) + vec[1]
mean <- weighted.mean(x, y)
std <- sqrt(weighted.mean((x - mean)^2, y))
plot(x, y, type = "h",
main = "Read Length Distribution",
- sub=paste("Mean = ", round(mean, 1), ", Std = ", round(std, 1)),
+ sub = sprintf("Mode = %d, Mean = %.1f, and Std = %.1f", mode_len, mean, std),
xlab = "Read Length",
ylab = "Probability")
}
@@ -140,10 +142,28 @@ if (model_type == 1 || model_type == 3) {
close(con)
+# Alignment statistics
pair <- read.table(file = cntF, skip = 3, sep = "\t")
-barplot(pair[,2], names.arg = pair[,1],
- xlab = "Alignments per fragment",
- ylab = "Number of fragments",
- main = "Histogram of Alignments per Fragment")
+
+stat_len = dim(pair)[1]
+upper_bound = pair[stat_len - 1, 1]
+my_labels = append(0:upper_bound, pair[stat_len, 1])
+my_heights = rep(0, upper_bound + 2)
+dummy = sapply(1:(stat_len - 1), function(id) { my_heights[pair[id, 1] + 1] <<- pair[id, 2] })
+my_heights[upper_bound + 2] = pair[stat_len, 2]
+my_colors = c("green", "blue", rep("dimgrey", upper_bound - 1), "red")
+
+barplot(my_heights, names.arg = my_labels,
+ col = my_colors, border = NA,
+ xlab = "Number of alignments per read",
+ ylab = "Number of reads",
+ main = "Alignment statistics")
+
+pie_values = c(my_heights[1], my_heights[2], sum(my_heights[3:(upper_bound + 1)]), my_heights[upper_bound + 2])
+pie_names = c("Unalignable", "Unique", "Multi", "Filtered")
+pie_labels = sprintf("%s %.0f%%", pie_names, pie_values * 100.0 / sum(pie_values))
+par(fig = c(0.4, 1, 0.35, 0.95), new = T)
+pie(pie_values, labels = pie_labels, col = c("green", "blue", "dimgrey", "red"), clockwise = T, init.angle = 270, cex = 0.8)
+par(fig = c(0, 1, 0, 1))
dev.off.output <- dev.off()
diff --git a/rsem-prepare-reference b/rsem-prepare-reference
index ab0ebce..f42fa8c 100755
--- a/rsem-prepare-reference
+++ b/rsem-prepare-reference
@@ -16,6 +16,9 @@ use warnings;
my $status;
my $gtfF = "";
+my $gff3F = "";
+my $gff3_RNA_patterns = "";
+my $gtf_sources = "None";
my $mappingF = "";
my $polyAChoice = 1; # 0, --polyA, add polyA tails for all isoforms; 1, default, no polyA tails; 2, --no-polyA-subset
my $polyA = 0; # for option --polyA, default off
@@ -36,6 +39,9 @@ my $star_nthreads = 1;
my $star_sjdboverhang = 100;
GetOptions("gtf=s" => \$gtfF,
+ "gff3=s" => \$gff3F,
+ "gff3-RNA-patterns=s" => \$gff3_RNA_patterns,
+ "trusted-sources=s" => \$gtf_sources,
"transcript-to-gene-map=s" => \$mappingF,
"allele-to-gene-map=s" => \$alleleMappingF,
"polyA" => \$polyA,
@@ -54,16 +60,12 @@ GetOptions("gtf=s" => \$gtfF,
pod2usage(-verbose => 2) if ($help == 1);
pod2usage(-msg => "--transcript-to-gene-map and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($mappingF ne "") && ($alleleMappingF ne ""));
-pod2usage(-msg => "--gtf and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($alleleMappingF ne ""));
+pod2usage(-msg => "--gtf and --gff3 are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($gff3F ne ""));
+pod2usage(-msg => "--gtf/--gff3 and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if ((($gtfF ne "") || ($gff3F ne "")) && ($alleleMappingF ne ""));
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
if (!$bowtie && ($bowtie_path ne "")) { print "Warning: If Bowtie is not used, no need to set --bowtie-path option!\n"; }
if (!$bowtie2 && ($bowtie2_path ne "")) { print "Warning: If Bowtie 2 is not used, no need to set --bowtie2-path option!\n"; }
-
-my $type;
-
-if ($gtfF ne "") { $type = 0; }
-else { $type = 1; }
my @list = split(/,/, $ARGV[0]);
my $size = scalar(@list);
@@ -86,9 +88,21 @@ if ($star_path ne "") { $star_path .= "/"; }
my $command = "";
-if ($type == 0) {
+if ($gff3F ne "") {
+ $gtfF = "$ARGV[1].gtf";
+ pod2usage(-msg => "A file with the name $gtfF alreay exists! GFF3-to-GTF conversion failed!", -exitval => 2, -verbose => 2) if (-e $gtfF);
+ $command = "rsem-gff3-to-gtf";
+ if ($gff3_RNA_patterns ne "") {
+ $command .= " --RNA-patterns $gff3_RNA_patterns";
+ }
+ $command .= " $gff3F $gtfF";
+ &runCommand($command)
+}
+
+if ($gtfF ne "") {
$"=" ";
- $command = "rsem-extract-reference-transcripts $ARGV[1] $quiet $gtfF";
+ $gtf_sources =~ s/ /\\ /g;
+ $command = "rsem-extract-reference-transcripts $ARGV[1] $quiet $gtfF $gtf_sources";
if ($mappingF ne "") { $command .= " 1 $mappingF"; }
else { $command .= " 0"; }
$command .= " @list";
@@ -104,7 +118,8 @@ else {
&runCommand($command);
}
-$command = "rsem-preref $ARGV[1].transcripts.fa $polyAChoice $ARGV[1] -l $polyALen";
+$command = "rsem-preref $ARGV[1].transcripts.fa $polyAChoice $ARGV[1]";
+if ($polyAChoice != 1) { $command .= " -l $polyALen"; }
if ($polyAChoice == 2) { $command .= " -f $subsetFile"; }
if ($quiet) { $command .= " -q"; }
@@ -155,11 +170,11 @@ rsem-prepare-reference [options] reference_fasta_file(s) reference_name
=item B<reference_fasta_file(s)>
-Either a comma-separated list of Multi-FASTA formatted files OR a directory name. If a directory name is specified, RSEM will read all files with suffix ".fa" or ".fasta" in this directory. The files should contain either the sequences of transcripts or an entire genome, depending on whether the --gtf option is used.
+Either a comma-separated list of Multi-FASTA formatted files OR a directory name. If a directory name is specified, RSEM will read all files with suffix ".fa" or ".fasta" in this directory. The files should contain either the sequences of transcripts or an entire genome, depending on whether the '--gtf' option is used.
=item B<reference name>
-The name of the reference used. RSEM will generate several reference-related files that are prefixed by this name. This name can contain path information (e.g. /ref/mm9).
+The name of the reference used. RSEM will generate several reference-related files that are prefixed by this name. This name can contain path information (e.g. '/ref/mm9').
=back
@@ -171,10 +186,22 @@ The name of the reference used. RSEM will generate several reference-related fil
If this option is on, RSEM assumes that 'reference_fasta_file(s)' contains the sequence of a genome, and will extract transcript reference sequences using the gene annotations specified in <file>, which should be in GTF format.
-If this option is off, RSEM will assume 'reference_fasta_file(s)' contains the reference transcripts. In this case, RSEM assumes that name of each sequence in the Multi-FASTA files is its transcript_id.
+If this and '--gff3' options are off, RSEM will assume 'reference_fasta_file(s)' contains the reference transcripts. In this case, RSEM assumes that name of each sequence in the Multi-FASTA files is its transcript_id.
(Default: off)
+=item B<--gff3> <file>
+
+The annotation file is in GFF3 format instead of GTF format. RSEM will first convert it to GTF format with the file name 'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not exist. (Default: off)
+
+=item B<--gff3-RNA-patterns> <pattern>
+
+<pattern> is a comma-separated list of transcript categories, e.g. "mRNA,rRNA". Only transcripts that match the <pattern> will be extracted. (Default: "mRNA")
+
+=item B<--trusted-sources> <sources>
+
+<sources> is a comma-separated list of trusted sources, e.g. "ENSEMBL,HAVANA". Only transcripts coming from these sources will be extracted. If this option is off, all sources are accepted. (Default: off)
+
=item B<--transcript-to-gene-map> <file>
Use information from <file> to map from transcript (isoform) ids to gene ids.
@@ -186,7 +213,7 @@ with the two fields separated by a tab character.
If you are using a GTF file for the "UCSC Genes" gene set from the UCSC Genome Browser, then the "knownIsoforms.txt" file (obtained from the "Downloads" section of the UCSC Genome Browser site) is of this format.
-If this option is off, then the mapping of isoforms to genes depends on whether the --gtf option is specified. If --gtf is specified, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene.
+If this option is off, then the mapping of isoforms to genes depends on whether the '--gtf' option is specified. If '--gtf' is specified, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene.
(Default: off)
diff --git a/rsem-refseq-extract-primary-assembly b/rsem-refseq-extract-primary-assembly
new file mode 100755
index 0000000..8289e70
--- /dev/null
+++ b/rsem-refseq-extract-primary-assembly
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+from sys import argv, exit
+
+if len(argv) != 3:
+ print("Usage: rsem-refseq-extract-primary-assembly input_top_level_assembly.fna output_primary_assembly.fna")
+ exit(-1)
+
+writeOut = True
+with open(argv[1]) as fin:
+ with open(argv[2], "w") as fout:
+ for line in fin:
+ line = line.strip()
+ if line[0] == '>':
+ writeOut = line.rfind("Primary Assembly") >= 0
+ if writeOut:
+ fout.write(line + "\n")
+
diff --git a/rsem-run-ebseq b/rsem-run-ebseq
index 47f7678..e8dde10 100755
--- a/rsem-run-ebseq
+++ b/rsem-run-ebseq
@@ -104,15 +104,15 @@ This file is only generated when there are more than 2 conditions. It gives the
=head1 EXAMPLES
-1) We're interested in isoform-level differential expression analysis and there are two conditions. Each condition has 5 replicates. We have already collected the data matrix as 'IsoMat' and generated ngvector as 'ngvector.ngvec':
+1) We're interested in isoform-level differential expression analysis and there are two conditions. Each condition has 5 replicates. We have already collected the data matrix as 'IsoMat.txt' and generated ngvector as 'ngvector.ngvec':
- rsem-run-ebseq --ngvector ngvector.ngvec IsoMat 5,5 IsoMat.results
+ rsem-run-ebseq --ngvector ngvector.ngvec IsoMat.txt 5,5 IsoMat.results
The results will be in 'IsoMat.results'.
-2) We're interested in gene-level analysis and there are 3 conditions. The first condition has 3 replicates and the other two has 4 replicates each. The data matrix is named as 'GeneMat':
+2) We're interested in gene-level analysis and there are 3 conditions. The first condition has 3 replicates and the other two has 4 replicates each. The data matrix is named as 'GeneMat.txt':
- rsem-run-ebseq GeneMat 3,4,4 GeneMat.results
+ rsem-run-ebseq GeneMat.txt 3,4,4 GeneMat.results
Three files, 'GeneMat.results', 'GeneMat.results.pattern', and 'GeneMat.results.condmeans', will be generated.
diff --git a/rsem_perl_utils.pm b/rsem_perl_utils.pm
index eaf7ea4..01e87e0 100644
--- a/rsem_perl_utils.pm
+++ b/rsem_perl_utils.pm
@@ -9,7 +9,7 @@ our @ISA = qw(Exporter);
our @EXPORT = qw(runCommand);
our @EXPORT_OK = qw(runCommand collectResults showVersionInfo);
-my $version = "RSEM v1.2.23";
+my $version = "RSEM v1.2.26";
# command, {err_msg}
sub runCommand {
@@ -33,11 +33,11 @@ sub runCommand {
print "\n";
}
-my @allele_title = ("allele_id", "transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "AlleleIsoPct", "AlleleGenePct", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "AlleleIsoPct_from_pme_TPM", "AlleleGenePct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
+my @allele_title = ("allele_id", "transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "AlleleIsoPct", "AlleleGenePct", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "AlleleIsoPct_from_pme_TPM", "AlleleGenePct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "TPM_coefficient_of_quartile_variation", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound", "FPKM_coefficient_of_quartile_variation");
-my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
+my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "TPM_coefficient_of_quartile_variation", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound", "FPKM_coefficient_of_quartile_variation");
-my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
+my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "TPM_coefficient_of_quartile_variation", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound", "FPKM_coefficient_of_quartile_variation");
# type, inpF, outF
sub collectResults {
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/rsem.git
More information about the debian-med-commit
mailing list