[med-svn] [rsem] 01/01: Imported Upstream version 1.2.31+dfsg
Michael Crusoe
misterc-guest at moszumanska.debian.org
Sat Oct 15 09:17:53 UTC 2016
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to annotated tag upstream/1.2.31+dfsg
in repository rsem.
commit fdf75d6c94a51ed3871944c89bf09bd029beb542
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Sat Jun 4 04:33:45 2016 -0700
Imported Upstream version 1.2.31+dfsg
---
WHAT_IS_NEW | 7 +
rsem-calculate-expression | 6 +-
rsem-gff3-to-gtf | 344 ++++++++++++++++++++++++++++++++++------------
rsem-prepare-reference | 4 +
rsem_perl_utils.pm | 14 +-
5 files changed, 284 insertions(+), 91 deletions(-)
diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW
index 3f0c1c6..f897484 100644
--- a/WHAT_IS_NEW
+++ b/WHAT_IS_NEW
@@ -1,3 +1,10 @@
+RSEM v1.2.31
+
+- Rewrote `rsem-gff3-to-gtf` to handle a more general set of GFF3 files
+- Added safety checks to make sure poly(A) tails are not added to the reference when `--star` is set
+
+--------------------------------------------------------------------------------------------
+
RSEM v1.2.30
- Fixed a bug that can cause SAMtools sort to fail
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index 808d839..d120bfa 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -5,7 +5,7 @@ use Pod::Usage;
use File::Basename;
use FindBin;
use lib $FindBin::RealBin;
-use rsem_perl_utils qw(runCommand collectResults showVersionInfo getSAMTOOLS);
+use rsem_perl_utils qw(runCommand collectResults showVersionInfo getSAMTOOLS hasPolyA);
use Env qw(@PATH);
@@ -167,7 +167,7 @@ pod2usage(-verbose => 2) if ($help == 1);
if ($is_alignment) {
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
- pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals, --solexa-quals, --bowtie2, --bowtie2-path, --bowtie2-mismatch-rate, --bowtie2-k and --bowtie2-sensitivity-level cannot be set if input is SAM/BAM/CRAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa || $bowtie2 || $bowtie2_path ne "" || $bowtie2_mismatch_rate != 0.1 || $bowtie2_k != 20 [...]
+ pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals, --solexa-quals, --bowtie2, --bowtie2-path, --bowtie2-mismatch-rate, --bowtie2-k, --bowtie2-sensitivity-level, --star, --star-path, and --star-output-genome-bam cannot be set if input is SAM/BAM/CRAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa || $bowtie2 || $bowtie2_path ne "" || $ [...]
}
else {
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (!$paired_end && scalar(@ARGV) != 3 || $paired_end && scalar(@ARGV) != 4);
@@ -226,6 +226,8 @@ if (((-e "$refName.ta") && !(-e "$refName.gt")) || (!(-e "$refName.ta") && (-e "
$alleleS = (-e "$refName.ta") && (-e "$refName.gt");
+pod2usage(-msg => "RSEM reference cannot contain poly(A) tails if you want to use STAR aligner!", -exitval => 2, -verbose => 2) if ($star && (&hasPolyA("$refName.seq")));
+
if ($genGenomeBamF) {
open(INPUT, "$refName.ti");
my $line = <INPUT>; chomp($line);
diff --git a/rsem-gff3-to-gtf b/rsem-gff3-to-gtf
index 13c64be..344d869 100755
--- a/rsem-gff3-to-gtf
+++ b/rsem-gff3-to-gtf
@@ -3,103 +3,275 @@
import os
import sys
import argparse
-import re
+from operator import itemgetter
+
+type_gene = ["gene", "snRNA_gene", "transposable_element_gene", "ncRNA_gene", "telomerase_RNA_gene",
+ "rRNA_gene", "tRNA_gene", "snoRNA_gene", "mt_gene", "miRNA_gene", "lincRNA_gene", "RNA", "VD_gene_segment"]
+type_transcript = ["transcript", "primary_transcript", "mRNA", "ncRNA", "tRNA", "rRNA", "snRNA", "snoRNA", "miRNA",
+ "pseudogenic_transcript", "lincRNA", "NMD_transcript_variant", "aberrant_processed_transcript",
+ "nc_primary_transcript", "processed_pseudogene", "mRNA_TE_gene"]
+type_exon = ["exon", "CDS", "five_prime_UTR", "three_prime_UTR", "UTR", "noncoding_exon", "pseudogenic_exon"]
+
+# can be either gene or transcript, need special treatment
+type_gene_or_transcript = ["pseudogene", "V_gene_segment", "C_gene_segment", "J_gene_segment", "processed_transcript"]
+
class HelpOnErrorParser(argparse.ArgumentParser):
- def error(self, msg):
- sys.stderr.write("{0}: error: {1}\n\n".format(os.path.basename(sys.argv[0]), msg))
- self.print_help()
- sys.exit(-1)
+ def error(self, msg):
+ sys.stderr.write("{0}: error: {1}\n\n".format(os.path.basename(sys.argv[0]), msg))
+ self.print_help()
+ sys.exit(-1)
+
def my_assert(bool, msg):
- if not bool:
- sys.stderr.write(msg + "\n")
- sys.exit(-1)
-
-def findTag(tag, attributes):
- pos = attributes.find(tag + "=")
- if pos < 0:
- return None
- pos += len(tag) + 1
- rpos = attributes.find(";", pos)
- if rpos < 0:
- rpos = len(attributes)
- return attributes[pos:rpos]
+ if not bool:
+ sys.stderr.write(msg + "\n")
+ try:
+ os.remove(args.output_GTF_file)
+ except OSError:
+ pass
+ sys.exit(-1)
+
+
+class Feature:
+ # def gen_type_dict():
+ def gen_type_dict(self):
+ my_dict = {}
+ for my_type in type_gene:
+ my_dict[my_type] = "gene"
+ for my_type in type_transcript:
+ my_dict[my_type] = "transcript"
+ for my_type in type_exon:
+ my_dict[my_type] = "exon"
+
+ for my_type in type_gene_or_transcript:
+ my_dict[my_type] = "gene_or_transcript"
+
+ return my_dict
+
+ # type_dict = gen_type_dict()
+
+ def __init__(self):
+ self.type_dict = self.gen_type_dict()
+
+ def parse(self, line, line_no):
+ """ line should be free of leading and trailing spaces """
+
+ self.line = line
+ self.line_no = line_no
+
+ fields = line.split('\t')
+ my_assert(len(fields) == 9, "Line {0} does not have 9 fields:\n{1}".format(self.line_no, self.line))
+
+ self.seqid = fields[0]
+ self.source = fields[1]
+ self.original_type = fields[2]
+ self.feature_type = self.type_dict.get(fields[2], None)
+ self.start = int(fields[3])
+ self.end = int(fields[4])
+ self.strand = fields[6]
+ self.attributes = fields[8][:-1] if len(fields[8]) > 0 and fields[8][-1] == ';' else fields[8]
+
+ def parseAttributes(self):
+ self.attribute_dict = {}
+ for attribute in self.attributes.split(';'):
+ fields = attribute.split('=')
+ my_assert(len(fields) == 2, "Fail to parse attribute {0} of line {1}:\n{2}".format(attribute, self.line_no, self.line))
+ tag, value = fields
+ if tag == "Parent":
+ self.attribute_dict[tag] = value.split(',')
+ else:
+ self.attribute_dict[tag] = value
+
+ def getAttribute(self, tag, required = False):
+ value = self.attribute_dict.get(tag, None)
+ my_assert(not required or value != None, "Line {0} does not have attribute {1}:\n{2}".format(self.line_no, tag, self.line))
+ return value
+
+
+class Transcript:
+ def __init__(self, tid, feature):
+ self.tid = tid
+ self.tname = self.ttype = None
+ self.gid = self.gname = None
+ self.setT = False # if a transcript feature has been set
+
+ self.seqid = feature.seqid
+ # self.source = feature.source
+ self.source = None
+ self.strand = feature.strand
+
+ self.intervals = []
+
+ def setTranscript(self, feature):
+ my_assert(not self.setT,
+ "Transcript {0} appears multiple times! Last occurrence is at line {1}:\n{2}".format(self.tid, feature.line_no, feature.line))
+ self.setT = True
+ parents = feature.getAttribute("Parent", True)
+ my_assert(len(parents) == 1, "Transcript {0} at line {1} has more than one parents:\n{2}".format(self.tid, feature.line_no, feature.line))
+ self.gid = parents[0]
+ self.tname = feature.getAttribute("Name")
+ self.ttype = feature.original_type
+ self.source = feature.source
+
+ def addExon(self, feature):
+ self.intervals.append((feature.start, feature.end))
+
+ def merge(self):
+ self.intervals.sort(key = itemgetter(0))
+ self.results = []
+ cstart, cend = self.intervals[0]
+ for start, end in self.intervals[1:]:
+ if cend + 1 >= start:
+ cend = max(cend, end)
+ else:
+ self.results.append((cstart, cend))
+ cstart = start
+ cend = end
+ self.results.append((cstart, cend))
+
+ def __iter__(self):
+ self.index = 0
+ return self
+
+ def next(self):
+ if self.index == len(self.results):
+ raise StopIteration
+ interval = self.results[self.index]
+ self.index += 1
+ return interval
+
+
+def getTranscript(tid, feature):
+ assert tid != None
+
+ pos = tid2pos.get(tid, None)
+ if pos == None:
+ transcript = Transcript(tid, feature)
+ tid2pos[tid] = len(transcripts)
+ transcripts.append(transcript)
+ else:
+ my_assert(pos >= 0,
+ "Line {0} describes an already processed Transcript {1}:\n{2}".format(feature.line_no, tid, feature.line))
+ transcript = transcripts[pos]
+ my_assert(transcript.seqid == feature.seqid and transcript.strand == feature.strand,
+ "Line {0}'s seqid/strand is not consistent with other records of transcript {1}:\n{2}".format(
+ feature.line_no, tid, feature.line))
+
+ return transcript
+
+def flush_out(fout):
+ global transcripts
+ global tid2pos
+ global num_trans
+ global patterns
+
+ for transcript in transcripts:
+ tid2pos[transcript.tid] = -1
+ if not transcript.setT or len(transcript.intervals) == 0 or (len(patterns) > 0 and transcript.ttype not in patterns):
+ continue
+
+ my_assert(transcript.gid in gid2gname,
+ "Cannot recognize transcript {0}'s parent {1}, a gene feature might be missing.".format(transcript.tid, transcript.gid))
+
+ transcript.gname = gid2gname[transcript.gid]
+
+ transcript.merge()
+
+ output_string = "{0}\t{1}\texon\t{{0}}\t{{1}}\t.\t{2}\t.\tgene_id \"{3}\"; transcript_id \"{4}\";".format(
+ transcript.seqid, transcript.source, transcript.strand, transcript.gid, transcript.tid)
+ if transcript.gname != None:
+ output_string += " gene_name \"{0}\";".format(transcript.gname)
+ if transcript.tname != None:
+ output_string += " transcript_name \"{0}\";".format(transcript.tname)
+ output_string += "\n"
+
+ for start, end in transcript:
+ fout.write(output_string.format(start, end))
+
+ num_trans += 1
+
+ transcripts = []
+
+
parser = HelpOnErrorParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, description = "Convert GFF3 files to GTF files.")
parser.add_argument("input_GFF3_file", help = "Input GFF3 file.")
parser.add_argument("output_GTF_file", help = "Output GTF file.")
-parser.add_argument("--RNA-patterns", help = "Types of RNAs to be extracted.", default = "mRNA")
-
+parser.add_argument("--RNA-patterns", help = "Types of RNAs to be extracted, e.g. mRNA,rRNA", metavar = "<patterns>")
+parser.add_argument("--extract-sequences", help = "If GFF3 file contains reference sequences, extract them to the specified file", metavar = "<output.fa>")
args = parser.parse_args()
-trans2gene = {} # tid -> gid
-gid2name = {} # gid -> gene name
-tid2name = {} # tid -> transcript name
+patterns = set()
+if args.RNA_patterns != None:
+ patterns = set(args.RNA_patterns.split(','))
-rgx = re.compile("[\t]+")
-rgx2 = re.compile("^(" + "|".join(args.RNA_patterns.split(",")) + ")$")
+line_no = 0
+feature = Feature()
-fin = open(args.input_GFF3_file, "r")
-fout = open(args.output_GTF_file, "w")
+gid2gname = {}
-line_no = 0
+tid2pos = {}
+transcripts = []
+
+num_trans = 0
+
+reachFASTA = False
+
+with open(args.input_GFF3_file) as fin:
+ fout = open(args.output_GTF_file, "w")
+
+ for line in fin:
+ line = line.strip()
+ line_no += 1
+ if line_no % 100000 == 0:
+ print("Loaded {} lines".format(line_no))
+
+ if line.startswith("##FASTA"):
+ reachFASTA = True
+ break
+
+ if line.startswith("###"):
+ flush_out(fout)
+ continue
+
+ if line.startswith("#"):
+ continue
+
+ feature.parse(line, line_no)
+ if feature.feature_type == None:
+ continue
+ feature.parseAttributes()
+
+ if feature.feature_type == "gene_or_transcript":
+ parent = feature.getAttribute("Parent")
+ if parent == None:
+ feature.feature_type = "gene"
+ else:
+ feature.feature_type = "transcript"
+
+ if feature.feature_type == "gene":
+ gid = feature.getAttribute("ID", True)
+ my_assert(gid not in gid2gname,
+ "Gene {0} appears multiple times! Last occurrence is at line {1}:\n{2}".format(gid, feature.line_no, feature.line))
+ gid2gname[gid] = feature.getAttribute("Name")
+ elif feature.feature_type == "transcript":
+ transcript = getTranscript(feature.getAttribute("ID", True), feature)
+ transcript.setTranscript(feature)
+ else:
+ assert feature.feature_type == "exon"
+ for parent in feature.getAttribute("Parent", True):
+ transcript = getTranscript(parent, feature)
+ transcript.addExon(feature)
+
+ flush_out(fout)
+ fout.close()
+
+ print("GTF file is successully generated.")
+ print("There are {0} transcripts contained in the generated GTF file.".format(num_trans))
-for line in fin:
- line = line.rstrip("\r\n") # remove return characters
- line_no += 1
-
- if line.startswith("##FASTA"):
- break
- if line.startswith("#"):
- if line.startswith("###"):
- # clear all dictionaries
- trans2gene = {}
- gid2name = {}
- tid2name = {}
- continue
-
- arr = rgx.split(line)
- my_assert(len(arr) == 9, "Line {0} does not have 9 fields:\n{1}".format(line_no, line))
-
- if arr[2] == "exon":
- parent = findTag("Parent", arr[8])
- my_assert(parent != None, "Line {0} does not have a Parent tag:\n{1}".format(line_no, line))
-
- tids = parent.split(",")
- for tid in tids:
- if tid not in trans2gene:
- continue
- gid = trans2gene[tid]
- fout.write("{0}\tgene_id \"{1}\"; transcript_id \"{2}\";".format("\t".join(arr[:-1]), gid, tid))
- name = gid2name.get(gid, None)
- if name != None:
- fout.write(" gene_name \"{0}\";".format(name))
- name = tid2name.get(tid, None)
- if name != None:
- fout.write(" transcript_name \"{0}\";".format(name))
- fout.write("\n")
-
- elif arr[2] == "gene":
- gid = findTag("ID", arr[8])
- name = findTag("Name", arr[8])
-
- my_assert(gid != None, "Line {0} does not have a ID tag:\n{1}".format(line_no, line))
-
- if name != None:
- gid2name[gid] = name
-
- elif rgx2.search(arr[2]) != None:
- tid = findTag("ID", arr[8])
- gid = findTag("Parent", arr[8])
- name = findTag("Name", arr[8])
-
- my_assert(tid != None, "Line {0} does not have a ID tag:\n{1}".format(line_no, line))
- my_assert(gid != None, "Line {0} does not have a Parent tag:\n{1}".format(line_no, line))
-
- trans2gene[tid] = gid
- if name != None:
- tid2name[tid] = name
-
-fin.close()
-fout.close()
+ if reachFASTA and args.extract_sequences != None:
+ with open(args.extract_sequences, "w") as fout:
+ for line in fin:
+ fout.write(line)
+ print("FASTA file is successfully generated.")
\ No newline at end of file
diff --git a/rsem-prepare-reference b/rsem-prepare-reference
index 66e0e81..ed21552 100755
--- a/rsem-prepare-reference
+++ b/rsem-prepare-reference
@@ -63,9 +63,11 @@ pod2usage(-msg => "--transcript-to-gene-map and --allele-to-gene-map are mutuall
pod2usage(-msg => "--gtf and --gff3 are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($gff3F ne ""));
pod2usage(-msg => "--gtf/--gff3 and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if ((($gtfF ne "") || ($gff3F ne "")) && ($alleleMappingF ne ""));
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+pod2usage(-msg => "No poly(A) tail should be added if --star is set!", -exitval => 2, -verbose => 2) if ($star && $polyA);
if (!$bowtie && ($bowtie_path ne "")) { print "Warning: If Bowtie is not used, no need to set --bowtie-path option!\n"; }
if (!$bowtie2 && ($bowtie2_path ne "")) { print "Warning: If Bowtie 2 is not used, no need to set --bowtie2-path option!\n"; }
+if (!$star && ($star_path ne "")) { print "Warning: If STAR is not used, no need to set --star-path option!\n"; }
my @list = split(/,/, $ARGV[0]);
my $size = scalar(@list);
@@ -142,6 +144,8 @@ if ($bowtie2) {
}
if ($star) {
+ pod2usage(-msg => "Sorry, if you want RSEM run STAR for you, you must provide the genome sequence and associated GTF annotation.", -exitval => 2, -verbose => 2) if ($gtfF eq "");
+
my $out_star_genome_path = dirname($ARGV[1]);
$command = $star_path . "STAR " .
" --runThreadN $star_nthreads " .
diff --git a/rsem_perl_utils.pm b/rsem_perl_utils.pm
index c6a5d94..783ee26 100644
--- a/rsem_perl_utils.pm
+++ b/rsem_perl_utils.pm
@@ -7,9 +7,9 @@ use strict;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(runCommand);
-our @EXPORT_OK = qw(runCommand collectResults showVersionInfo getSAMTOOLS);
+our @EXPORT_OK = qw(runCommand collectResults showVersionInfo getSAMTOOLS hasPolyA);
-my $version = "RSEM v1.2.30"; # Update version info here
+my $version = "RSEM v1.2.31"; # Update version info here
my $samtools = "samtools-1.3"; # If update to another version of SAMtools, need to change this
# command, {err_msg}
@@ -95,7 +95,15 @@ sub showVersionInfo {
}
sub getSAMTOOLS {
- return $samtools
+ return $samtools;
+}
+
+sub hasPolyA {
+ open(INPUT, $_[0]);
+ my $line = <INPUT>; chomp($line);
+ close(INPUT);
+ my ($fullLen, $totLen) = split(/ /, $line);
+ return $fullLen < $totLen;
}
1;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/rsem.git
More information about the debian-med-commit
mailing list