[med-svn] [tophat] 01/04: Imported Upstream version 2.0.10
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Thu Nov 21 10:21:33 UTC 2013
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository tophat.
commit e58d4d0b18548b92609ff3e3be5dab765474f273
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Thu Nov 21 09:45:23 2013 +0100
Imported Upstream version 2.0.10
---
README | 2 +
THANKS | 36 ++-
configure | 24 +-
configure.ac | 2 +-
src/common.cpp | 4 +-
src/common.h | 4 +-
src/long_spanning_reads.cpp | 6 +-
src/prep_reads.cpp | 140 ++++++++----
src/reads.cpp | 191 +++++++---------
src/reads.h | 135 ++++++++---
src/sam_juncs.cpp | 2 -
src/segment_juncs.cpp | 38 +---
src/tophat-fusion-post | 7 +-
src/tophat.py | 114 +++++-----
src/tophat_reports.cpp | 528 ++++++++++++++++++++++++-------------------
15 files changed, 694 insertions(+), 539 deletions(-)
diff --git a/README b/README
index 37d9c05..f7ce76a 100644
--- a/README
+++ b/README
@@ -28,3 +28,5 @@ Samtools files as indicated (the header files and libbam.a)
TopHat also requires the Boost libraries (http://www.boost.org).
Please refer to the TopHat webpage for installation information.
+Note TopHat uses the SeqAn library that comes with the TopHat2 source code distribution.
+It is not necessary for the SeqAn library to be installed separately in order to run TopHat.
diff --git a/THANKS b/THANKS
index bd93110..a6c7889 100644
--- a/THANKS
+++ b/THANKS
@@ -1,4 +1,28 @@
-TopHat THANKS file
+TopHat and TopHat2 THANKS file
+
+TopHat2 was developed by Daehwan Kim, Geo Pertea, Cole Trapnell, and Steven Salzberg,
+with help of Harold Pimentel and Ryan Kelly. Lior Pachter and Adam Roberts contributed to
+the discussion of TopHat2. The following list shows our contacts and people
+who contributed to the development of TopHat2 by suggesting improvements and giving feedbacks
+as well as reporting bugs with their test data set.
+
+Arthur Delcher aldelcher at gmail.com
+Liliana Florea florea at jhu.edu
+Brian Haas bhaas at broadinstitute.org
+Kasper Hansen kasperdanielhansen at gmail.com
+Ryan Kelley ryankelly at illumina.com
+Daehwan Kim infphilo at gmail.com
+Ben Langmead langmea at cs.jhu.edu
+Lior Pachter lpachter at math.berkeley.edu
+Ela Pertea mpertea at jhu.edu
+Geo Pertea gpertea at jhu.edu
+Harold Pimentel pimentel at cs.berkeley.edu
+Nathalie Pochet npochet at broadinstitute.org
+Adam Roberts adarob at gmail.com
+Cole Trapnell cole at broadinstitute.org
+Steven Salzberg salzberg at jhu.edu
+Alec Wysoker alecw at broadinstitute.org
+
TopHat was originally written by Cole Trapnell, with substantial creative
input from Lior Pachter and Steven Salzberg. Many people further contributed
@@ -10,20 +34,20 @@ Robert Bradley rbradley at math.berkeley.edu
Steven Brenner brenner at compbio.berkeley.edu
Angela Brooks angelabrooks at berkeley.edu
Mark Diekhans markd at soe.ucsc.edu
-Mike Duff moduff at gmail.com
+Mike Duff moduff at gmail.com
Steffen Durinck sdurinck at illumina.com
Matthew Fitzgibbon mfitzgib at fhcrc.org
Brenton Graveley graveley at neuron.uchc.edu
-Kasper Hansen khansen at stat.berkeley.edu
+Kasper Hansen kasperdanielhansen at gmail.com
Rachel Harte hartera at soe.ucsc.edu
-Ben Langmead langmead at cs.umd.edu
+Ben Langmead langmea at cs.jhu.edu
Richard McCombie mccombie at cshl.edu
Ali Mortazavi alim at caltech.edu
Adam Phillippy amp at umiacs.umd.edu
-Mike Schatz mschatz at umiacs.umd.edu
+Mike Schatz mschatz at cshl.edu
Gary Schroth gschroth at illumina.com
Gavin Sherlock sherlock at genome.stanford.edu
-Diane Trout diane at caltech.edu
+Diane Trout diane at caltech.edu
Svilen Tzonev stzonev at illumina.com
Jeltje Van Baren jeltje at wustl.edu
Barbara Wold wold at caltech.edu
diff --git a/configure b/configure
index 0553e54..932984f 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.59 for tophat 2.0.9.
+# Generated by GNU Autoconf 2.59 for tophat 2.0.10.
#
# Report bugs to <tophat.cufflinks at gmail.com>.
#
@@ -269,8 +269,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='tophat'
PACKAGE_TARNAME='tophat'
-PACKAGE_VERSION='2.0.9'
-PACKAGE_STRING='tophat 2.0.9'
+PACKAGE_VERSION='2.0.10'
+PACKAGE_STRING='tophat 2.0.10'
PACKAGE_BUGREPORT='tophat.cufflinks at gmail.com'
ac_unique_file="config.h.in"
@@ -792,7 +792,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures tophat 2.0.9 to adapt to many kinds of systems.
+\`configure' configures tophat 2.0.10 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -858,7 +858,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of tophat 2.0.9:";;
+ short | recursive ) echo "Configuration of tophat 2.0.10:";;
esac
cat <<\_ACEOF
@@ -1009,7 +1009,7 @@ fi
test -n "$ac_init_help" && exit 0
if $ac_init_version; then
cat <<\_ACEOF
-tophat configure 2.0.9
+tophat configure 2.0.10
generated by GNU Autoconf 2.59
Copyright (C) 2003 Free Software Foundation, Inc.
@@ -1023,7 +1023,7 @@ cat >&5 <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by tophat $as_me 2.0.9, which was
+It was created by tophat $as_me 2.0.10, which was
generated by GNU Autoconf 2.59. Invocation command line was
$ $0 $@
@@ -1361,7 +1361,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
cat >>confdefs.h <<\_ACEOF
-#define SVN_REVISION "4112M"
+#define SVN_REVISION "4167"
_ACEOF
@@ -1675,7 +1675,7 @@ fi
# Define the identity of the package.
PACKAGE='tophat'
- VERSION='2.0.9'
+ VERSION='2.0.10'
cat >>confdefs.h <<_ACEOF
@@ -7134,7 +7134,7 @@ fi
# Define the identity of the package.
PACKAGE='tophat'
- VERSION='2.0.9'
+ VERSION='2.0.10'
cat >>confdefs.h <<_ACEOF
@@ -7966,7 +7966,7 @@ _ASBOX
} >&5
cat >&5 <<_CSEOF
-This file was extended by tophat $as_me 2.0.9, which was
+This file was extended by tophat $as_me 2.0.10, which was
generated by GNU Autoconf 2.59. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -8029,7 +8029,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF
ac_cs_version="\\
-tophat config.status 2.0.9
+tophat config.status 2.0.10
configured by $0, generated by GNU Autoconf 2.59,
with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\"
diff --git a/configure.ac b/configure.ac
index 3d00c55..7fc65ab 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4,7 +4,7 @@ m4_include([ax_bam.m4])
#m4_include([ax_check_zlib.m4])
define([svnversion], esyscmd([sh -c "svnversion|tr -d '\n'"]))dnl
-AC_INIT([tophat],[2.0.9],[tophat.cufflinks at gmail.com])
+AC_INIT([tophat],[2.0.10],[tophat.cufflinks at gmail.com])
AC_DEFINE(SVN_REVISION, "svnversion", [SVN Revision])
AC_CONFIG_SRCDIR([config.h.in])
diff --git a/src/common.cpp b/src/common.cpp
index 64cb587..bc6664e 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -126,7 +126,7 @@ int read_edit_dist = 2;
int read_realign_edit_dist = 2;
int max_splice_mismatches = 1;
-ReadFormat reads_format = FASTQ;
+//ReadFormat reads_format = FASTQ;
bool verbose = false;
@@ -465,12 +465,14 @@ int parse_options(int argc, char** argv, void (*print_usage)())
switch (next_option) {
case -1:
break;
+ /*
case OPT_FASTA:
reads_format = FASTA;
break;
case OPT_FASTQ:
reads_format = FASTQ;
break;
+ */
case OPT_MIN_ANCHOR:
min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage);
break;
diff --git a/src/common.h b/src/common.h
index 40e42d3..49f904e 100644
--- a/src/common.h
+++ b/src/common.h
@@ -90,8 +90,8 @@ extern int read_realign_edit_dist;
extern int max_splice_mismatches;
-enum ReadFormat {FASTA, FASTQ};
-extern ReadFormat reads_format;
+enum ReadFormat {FASTX_AUTO=0, FASTA, FASTQ};
+//extern ReadFormat reads_format;
extern bool verbose;
extern unsigned int max_multihits;
diff --git a/src/long_spanning_reads.cpp b/src/long_spanning_reads.cpp
index d07ff19..598e4a9 100644
--- a/src/long_spanning_reads.cpp
+++ b/src/long_spanning_reads.cpp
@@ -2583,11 +2583,11 @@ bool dfs_seg_hits(RefSequenceTable& rt,
if (success)
join_success = true;
- if (num_try <= 0)
- return join_success;
-
seg_hit_stack.pop_back();
seg_hit_stack.back() = tempHit;
+
+ if (num_try <= 0)
+ return join_success;
}
}
}
diff --git a/src/prep_reads.cpp b/src/prep_reads.cpp
index 897093e..a989dbf 100644
--- a/src/prep_reads.cpp
+++ b/src/prep_reads.cpp
@@ -44,7 +44,7 @@ void load_readmap(string& flt_fname, vector<bool>& rmap) {
if (flt_fname.empty()) return;
ReadStream rdstream(flt_fname, NULL, true);
Read read;
- while (rdstream.get_direct(read, reads_format)) {
+ while (rdstream.get_direct(read)) {
uint32_t rnum=(uint32_t)atol(read.name.c_str());
if (rnum>=(uint32_t) rmap.size())
rmap.resize(rnum+1, false);
@@ -209,7 +209,7 @@ void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0,
wbam->write(bamrec.get_b(), rid);
}
-bool processRead(int matenum, Read& read, uint32_t next_id, int& num_reads_chucked,
+bool processRead(int matenum, Read& read, ReadFormat rd_format, uint32_t next_id, int& num_reads_chucked,
int& multimap_chucked, GBamWriter* wbam, FILE* fout, FILE* fqindex,
int& min_read_len, int& max_read_len, uint64_t& fout_offset, vector<bool>& rmap) {
if (read.seq.length()<12) {
@@ -269,7 +269,7 @@ bool processRead(int matenum, Read& read, uint32_t next_id, int& num_reads_chuc
}
if (wbam) {
- if (reads_format == FASTA && !quals)
+ if (rd_format == FASTA && !quals)
read.qual = string(read.seq.length(), 'I').c_str();
else if (color && quals)
read.qual = "!" + read.qual;
@@ -281,7 +281,7 @@ bool processRead(int matenum, Read& read, uint32_t next_id, int& num_reads_chuc
// because it may contain some control characters such as "\" from quality values.
// Here, buf is only used for calculating the file offset
char buf[2048] = {0};
- if (reads_format == FASTQ or (reads_format == FASTA && quals))
+ if (rd_format == FASTQ or (rd_format == FASTA && quals))
{
sprintf(buf, "@%u\n%s\n+%s\n%s\n",
next_id,
@@ -295,7 +295,7 @@ bool processRead(int matenum, Read& read, uint32_t next_id, int& num_reads_chuc
read.name.c_str(),
read.qual.c_str());
}
- else if (reads_format == FASTA)
+ else if (rd_format == FASTA)
{
string qual;
if (color)
@@ -342,25 +342,30 @@ void process_reads(vector<string>& reads_fnames, vector<FZPipe>& quals_files,
int multimap_chucked = 0;
int mate_num_reads_chucked = 0;
int mate_multimap_chucked = 0;
- int min_read_len = 20000000;
- int max_read_len = 0;
- int mate_min_read_len = 20000000;
- int mate_max_read_len = 0;
+ int left_min_len = 20000000;
+ int left_max_len = 0;
+ int unpaired_min_len=left_min_len;
+ int unpaired_max_len=left_max_len;
+ int unpaired_num_chucked=0;
+ int unpaired_multimap_chucked=0;
+ int right_min_len = 20000000;
+ int right_max_len = 0;
uint32_t next_id = 0;
+ uint32_t unpaired_count=0; //only used when PE_data + extra unpaired reads given
uint32_t num_left = 0;
uint32_t num_mates = 0;
+ bool PE_data = (mate_fnames.size() > 0);
FILE* fw=NULL; //aux output file
string outfname; //std_outfile after instancing template
string mate_outfname;
string idxfname; //index_outfile after instancing template
string mate_idxfname;
- bool have_mates = (mate_fnames.size() > 0);
if (!aux_outfile.empty()) {
fw=fopen(aux_outfile.c_str(), "w");
if (fw==NULL)
- err_die(ERR_FILE_CREATE,aux_outfile.c_str());
+ err_die(ERR_FILE_CREATE, aux_outfile.c_str());
}
FILE* fqindex = NULL; //fastq index
@@ -374,7 +379,7 @@ void process_reads(vector<string>& reads_fnames, vector<FZPipe>& quals_files,
if (std_outfile.empty()) {
fout=stdout;
//for PE reads, flt_side will decide which side is printed (can't be both)
- if (have_mates && flt_side==2)
+ if (PE_data && flt_side==2)
err_die("Error: --flt-side option required for PE reads directed to stdout!\n");
mate_fout=stdout;
}
@@ -382,7 +387,7 @@ void process_reads(vector<string>& reads_fnames, vector<FZPipe>& quals_files,
//could be a template
if (std_outfile.find("%side%") != string::npos) {
outfname=str_replace(std_outfile, "%side%", "left");
- if (have_mates)
+ if (PE_data)
mate_outfname=str_replace(std_outfile, "%side%", "right");
}
else {
@@ -390,7 +395,7 @@ void process_reads(vector<string>& reads_fnames, vector<FZPipe>& quals_files,
}
if (index_outfile.find("%side%") != string::npos) {
idxfname=str_replace(index_outfile, "%side%", "left");
- if (have_mates)
+ if (PE_data)
mate_idxfname=str_replace(index_outfile, "%side%", "right");
}
else {
@@ -431,30 +436,35 @@ void process_reads(vector<string>& reads_fnames, vector<FZPipe>& quals_files,
Read read;
Read mate_read;
ReadStream* reads=NULL;
+ ReadFormat rd_format=FASTX_AUTO;
ReadStream* mate_reads=NULL;
+ ReadFormat mate_rd_format=FASTX_AUTO;
FZPipe* fq=NULL;
FZPipe* mate_fq=NULL;
bool have_l_reads=(fi<reads_fnames.size());
- bool have_r_reads=(have_mates && fi<mate_fnames.size());
+ bool have_r_reads=(PE_data && fi<mate_fnames.size());
if (have_l_reads) {
if (quals)
fq = & quals_files[fi];
reads=new ReadStream(reads_fnames[fi], fq, true);
+ rd_format=reads->get_format();
}
if (have_r_reads) {
if (quals)
mate_fq = & mate_quals_files[fi];
mate_reads=new ReadStream(mate_fnames[fi], mate_fq, true);
+ mate_rd_format=mate_reads->get_format();
}
while (have_l_reads || have_r_reads) {
- if (have_l_reads && (have_l_reads=reads->get_direct(read, reads_format)))
+ if (have_l_reads && (have_l_reads=reads->get_direct(read)))
num_left++;
// Get the next read from the file
int matenum=0; // 0 = unpaired, 1 = left, 2 = right
- if (have_r_reads && (have_r_reads=mate_reads->get_direct(mate_read, reads_format)) ) {
+ if (have_r_reads && (have_r_reads=mate_reads->get_direct(mate_read)) ) {
num_mates++;
}
+ bool have_reads = (have_l_reads || have_r_reads);
if (have_l_reads && have_r_reads) {
matenum = 1; //read is first in a pair
if (have_l_reads && have_r_reads && !possible_mate_mismatch) {
@@ -478,19 +488,56 @@ void process_reads(vector<string>& reads_fnames, vector<FZPipe>& quals_files,
}
} //mate check
} //paired reads
- if (have_l_reads || have_r_reads) {
+ else if (PE_data && have_reads) {
+ ++unpaired_count;
+ if (have_l_reads) --num_left;
+ else --num_mates; //if (have_r_reads)
+ }
+ if (have_reads) {
//IMPORTANT: to keep paired reads in sync, this must be
//incremented BEFORE any reads are chucked !
++next_id;
}
- if ((flt_side & 1)==0 && have_l_reads)
- //for unpaired reads or left read in a pair
- processRead(matenum, read, next_id, num_reads_chucked, multimap_chucked,
- wbam, fout, fqindex, min_read_len, max_read_len, fout_offset, readmap);
+ int* min_rd_len=NULL, *max_rd_len=NULL,
+ *num_rd_chucked=NULL, *num_multimap_chucked=NULL;
+ if ((flt_side & 1)==0 && have_l_reads) {
+ if (PE_data && matenum==0) {
+ //extra unpaired read
+ min_rd_len=&unpaired_min_len;
+ max_rd_len=&unpaired_max_len;
+ num_rd_chucked=&unpaired_num_chucked;
+ num_multimap_chucked=&unpaired_multimap_chucked;
+ }
+ else { //paired read
+ min_rd_len=&left_min_len;
+ max_rd_len=&left_max_len;
+ num_rd_chucked=&num_reads_chucked;
+ num_multimap_chucked=&multimap_chucked;
+ }
+ processRead(matenum, read, rd_format, next_id, *num_rd_chucked, *num_multimap_chucked,
+ wbam, fout, fqindex, *min_rd_len, *max_rd_len, fout_offset, readmap);
+
+ }
if (flt_side>0 && have_r_reads) {
- matenum = have_l_reads ? 2 : 0;
- processRead(matenum, mate_read, next_id, mate_num_reads_chucked, mate_multimap_chucked,
- mate_wbam, mate_fout, mate_fqindex, mate_min_read_len, mate_max_read_len,
+ //matenum = have_l_reads ? 2 : 0;
+ if (have_l_reads) {
+ matenum = 2;
+ min_rd_len=&right_min_len;
+ max_rd_len=&right_max_len;
+ num_rd_chucked=&mate_num_reads_chucked;
+ num_multimap_chucked=&mate_multimap_chucked;
+ }
+ else { //paired read
+ //extra unpaired read
+ matenum = 0;
+ min_rd_len=&unpaired_min_len;
+ max_rd_len=&unpaired_max_len;
+ num_rd_chucked=&unpaired_num_chucked;
+ num_multimap_chucked=&unpaired_multimap_chucked;
+ }
+
+ processRead(matenum, mate_read, mate_rd_format, next_id, *num_rd_chucked, *num_multimap_chucked,
+ mate_wbam, mate_fout, mate_fqindex, *min_rd_len, *max_rd_len,
mate_fout_offset, mate_readmap);
}
} //while !fr.isEof()
@@ -507,35 +554,42 @@ void process_reads(vector<string>& reads_fnames, vector<FZPipe>& quals_files,
multimap_chucked, flt_reads_fnames[0].c_str());
}
- if (have_mates && (fout!=stdout || flt_side>0)) {
+ if (PE_data && (fout!=stdout || flt_side>0)) {
fprintf(stderr, "%u out of %u read mates have been filtered out\n",
mate_num_reads_chucked, num_mates);
if (readmap_loaded && mate_multimap_chucked)
fprintf(stderr, "\t(%u mates filtered out due to %s)\n",
mate_multimap_chucked, flt_reads_fnames[1].c_str());
}
-
+ //we should also print to stderr stats related to the extra unpaired reads here, if any
if (wbam) { delete wbam; }
if (mate_wbam) { delete mate_wbam; }
if (fout && fout!=stdout) fclose(fout);
if (mate_fout) fclose(mate_fout);
if (fw!=NULL) {
- string side("");
- if (have_mates)
- side="left_";
- fprintf(fw, "%smin_read_len=%d\n", side.c_str(), min_read_len - (color ? 1 : 0));
- fprintf(fw, "%smax_read_len=%d\n", side.c_str(), max_read_len - (color ? 1 : 0));
- fprintf(fw, "%sreads_in =%d\n", side.c_str(), num_left);
- fprintf(fw, "%sreads_out=%d\n", side.c_str(), num_left-num_reads_chucked);
- if (have_mates) {
- side="right_";
- fprintf(fw, "%smin_read_len=%d\n", side.c_str(), mate_min_read_len - (color ? 1 : 0));
- fprintf(fw, "%smax_read_len=%d\n", side.c_str(), mate_max_read_len - (color ? 1 : 0));
- fprintf(fw, "%sreads_in =%d\n", side.c_str(), num_mates);
- fprintf(fw, "%sreads_out=%d\n", side.c_str(), num_mates-mate_num_reads_chucked);
- }
- fclose(fw);
- }
+ string side("");
+ if (PE_data)
+ side="left_";
+ fprintf(fw, "%smin_read_len=%d\n", side.c_str(), left_min_len - (color ? 1 : 0));
+ fprintf(fw, "%smax_read_len=%d\n", side.c_str(), left_max_len - (color ? 1 : 0));
+ fprintf(fw, "%sreads_in =%d\n", side.c_str(), num_left);
+ fprintf(fw, "%sreads_out=%d\n", side.c_str(), num_left-num_reads_chucked);
+ if (PE_data) {
+ side="right_";
+ fprintf(fw, "%smin_read_len=%d\n", side.c_str(), right_min_len - (color ? 1 : 0));
+ fprintf(fw, "%smax_read_len=%d\n", side.c_str(), right_max_len - (color ? 1 : 0));
+ fprintf(fw, "%sreads_in =%d\n", side.c_str(), num_mates);
+ fprintf(fw, "%sreads_out=%d\n", side.c_str(), num_mates-mate_num_reads_chucked);
+ if (unpaired_count) {
+ side="unpaired_";
+ fprintf(fw, "%smin_read_len=%d\n", side.c_str(), unpaired_min_len - (color ? 1 : 0));
+ fprintf(fw, "%smax_read_len=%d\n", side.c_str(), unpaired_max_len - (color ? 1 : 0));
+ fprintf(fw, "%sreads_in =%d\n", side.c_str(), unpaired_count);
+ fprintf(fw, "%sreads_out=%d\n", side.c_str(), unpaired_count-unpaired_num_chucked);
+ }
+ }
+ fclose(fw);
+ }
if (fqindex) fclose(fqindex);
if (mate_fqindex) fclose(mate_fqindex);
diff --git a/src/reads.cpp b/src/reads.cpp
index 73dc58b..d46debf 100644
--- a/src/reads.cpp
+++ b/src/reads.cpp
@@ -60,118 +60,35 @@ char* FLineReader::nextLine() {
return buf;
}
-void skip_lines(FLineReader& fr)
+ReadFormat skip_lines(FLineReader& fr)
{
- if (fr.fhandle() == NULL) return;
+ if (fr.fhandle() == NULL) return FASTX_AUTO;
char* buf = NULL;
+ ReadFormat rfmt=FASTX_AUTO;
+ int lcount=0;
while ((buf = fr.nextLine()) != NULL) {
- if (buf[0] == '\0') continue;
- if (buf[0] == '>' || buf[0] == '@')
- {
- fr.pushBack();
+ ++lcount;
+ int i=0;
+ while (buf[i]!=0 && buf[i]==' ') ++i;
+ if (buf[i] == '\0') continue; //skip empty lines
+ if (buf[0] == '>') rfmt=FASTA;
+ if (buf[0] == '@') rfmt=FASTQ;
+ if (rfmt) {
+ fr.pushBack();
+ break;
+ }
+ /*
+ if (lcount==500) { //skipped 500 lines already, something is wrong here (large header?!?)
+ warn_msg("Warning: cannot determine file format! (large header?)\n");
break;
- }
+ }
+ */
}
-}
-
-bool next_fasta_record(FLineReader& fr,
- string& defline,
- string& seq,
- ReadFormat reads_format)
-
-{
- seq.clear();
- defline.clear();
- char* buf=NULL;
- while ((buf=fr.nextLine())!=NULL) {
- if (buf[0]==0) continue; //skip empty lines
- if ((reads_format == FASTA && buf[0] == '>') || (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
- if (seq.length()>0) { //current record ending
- fr.pushBack();
- return true;
- }
- defline=buf+1;
- string::size_type space_pos = defline.find_first_of(" \t");
- if (space_pos != string::npos) {
- defline.resize(space_pos);
- }
- continue;
- } //defline
- // sequence line
- seq.append(buf);
- } //line reading loop
- replace(seq.begin(), seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
- return !(seq.empty());
+ return rfmt;
}
-bool next_fastq_record(FLineReader& fr,
- const string& seq,
- string& alt_name,
- string& qual,
- ReadFormat reads_format)
-{
- alt_name.clear();
- qual.clear();
- char* fline=fr.nextLine();
- if (fline==NULL) return false;
- while (fline[0]==0) { //skip empty lines
- fline=fr.nextLine();
- if (fline==NULL) return false;
- }
- //must be on '+' line here
- if (fline==NULL || (reads_format == FASTQ && fline[0] != '+') ||
- (reads_format == FASTA && quals && fline[0] != '>')) {
- err_exit("Error: '+' not found for fastq record %s\n",fline);
- return false;
- }
- alt_name=fline+1;
- string::size_type space_pos = alt_name.find_first_of(" \t");
- if (space_pos != string::npos) alt_name.resize(space_pos);
- //read qv line(s) now:
- while ((fline=fr.nextLine())!=NULL) {
- if (integer_quals)
- {
- vector<string> integer_qual_values;
- tokenize(string(fline), " ", integer_qual_values);
-
- string temp_qual;
- for (size_t i = 0; i < integer_qual_values.size(); ++i)
- {
- int qual_value = atoi(integer_qual_values[i].c_str());
- if (qual_value < 0) qual_value = 0;
- temp_qual.push_back((char)(qual_value + 33));
- }
-
- qual.append(temp_qual);
- }
- else
- qual.append(fline);
- if (qual.length()>=seq.length()-1) break;
- }
- // final check
- if (color) {
- if (seq.length()==qual.length()) {
- //discard first qv
- qual=qual.substr(1);
- }
- if (seq.length()!=qual.length()+1) {
- err_exit("Error: length of quality string does not match seq length (%d) for color read %s!\n",
- seq.length(), alt_name.c_str());
- }
- }
- else {
- if (seq.length()!=qual.length()) {
- err_exit("Error: qual string length (%d) differs from seq length (%d) for read %s!\n",
- qual.length(), seq.length(), alt_name.c_str());
- //return false;
- }
- }
- //
- return !(qual.empty());
-}
-
-bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format,
+bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat rd_format,
FLineReader* frq) {
/*
if (fr.pushed_read)
@@ -185,8 +102,8 @@ bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format,
char* buf=NULL;
while ((buf=fr.nextLine())!=NULL) {
if (buf[0]==0) continue; //skip empty lines
- if ((reads_format == FASTA && buf[0] == '>') ||
- (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
+ if ((rd_format == FASTA && buf[0] == '>') ||
+ (rd_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
if (read.seq.length()>0) { //current record ending
fr.pushBack();
break;
@@ -203,7 +120,7 @@ bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format,
} //line reading loop
replace(read.seq.begin(), read.seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
- if (reads_format != FASTQ && frq==NULL)
+ if (rd_format != FASTQ && frq==NULL)
return (!read.seq.empty());
if (frq==NULL) frq=&fr; //FASTQ
//FASTQ or quals in a separate file -- now read quality values
@@ -214,8 +131,8 @@ bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format,
if (buf==NULL) return false;
}
//must be on '+' line here
- if (buf==NULL || (reads_format == FASTQ && buf[0] != '+') ||
- (reads_format == FASTA && buf[0] != '>')) {
+ if (buf==NULL || (rd_format == FASTQ && buf[0] != '+') ||
+ (rd_format == FASTA && buf[0] != '>')) {
err_exit("Error: beginning of quality values record not found! (%s)\n",buf);
return false;
}
@@ -605,11 +522,11 @@ void bam2Read(bam1_t *b, Read& rd, bool alt_name=false) {
}
-bool ReadStream::next_read(QReadData& rdata, ReadFormat read_format) {
+bool ReadStream::next_read(QReadData& rdata) {
while (read_pq.size()<ReadBufSize && !r_eof) {
//keep the queue topped off
Read rf;
- if (get_direct(rf, read_format)) {
+ if (get_direct(rf)) {
uint64_t id = (uint64_t)atol(rf.name.c_str());
QReadData rdata(id, rf, last_b());
read_pq.push(rdata);
@@ -623,7 +540,7 @@ bool ReadStream::next_read(QReadData& rdata, ReadFormat read_format) {
return true;
}
-bool ReadStream::get_direct(Read& r, ReadFormat read_format) {
+bool ReadStream::get_direct(Read& r) {
if (fstream.file==NULL) return false;
if (fstream.is_bam) {
bool got_read=false;
@@ -650,7 +567,6 @@ bool ReadStream::get_direct(Read& r, ReadFormat read_format) {
// reads must ALWAYS be requested in increasing order of their ID
bool ReadStream::getRead(uint64_t r_id,
Read& read,
- ReadFormat read_format,
bool strip_slash,
uint64_t begin_id,
uint64_t end_id,
@@ -672,7 +588,7 @@ bool ReadStream::getRead(uint64_t r_id,
read.clear();
while (!found) {
QReadData rdata;
- if (!next_read(rdata, read_format))
+ if (!next_read(rdata))
break;
/*
if (strip_slash) {
@@ -702,7 +618,52 @@ bool ReadStream::getRead(uint64_t r_id,
break;
}
if (rProc) { //skipped read processing (unmapped reads)
- if (!rProc->process(rdata, found, is_unmapped))
+ if (!rProc->process(rdata, found)) //, is_unmapped))
+ // rProc->process() should normally return TRUE
+ return false; //abort search for r_id, return "not found"
+ }
+ } //while target read id not found
+ return found;
+}
+
+bool ReadStream::getQRead(uint64_t r_id,
+ QReadData& qread,
+ uint64_t begin_id,
+ uint64_t end_id,
+ GetReadProc* rProc,
+ bool is_unmapped )
+ {
+ if (!fstream.file)
+ err_die("Error: calling ReadStream::getRead() with no file handle!");
+ if (r_id<last_id)
+ err_die("Error: ReadStream::getRead() called with out-of-order id#!");
+ last_id=r_id;
+ bool found=false;
+ qread.clear();
+ while (!found) {
+ QReadData rdata;
+ if (!next_read(rdata))
+ break;
+ if (rdata.id >= end_id)
+ return false;
+
+ if (rdata.id < begin_id)
+ continue; //silently skip until begin_id found
+ //does not trigger rProc->process() until begin_id
+ if (rdata.id == r_id)
+ {
+ qread=rdata; //it will be returned
+ found=true;
+ }
+ else if (rdata.id > r_id)
+ { //can't find it, went too far
+ //only happens when reads [mates] were removed for some reason
+ //read_pq.push(make_pair(id, read));
+ read_pq.push(rdata);
+ break;
+ }
+ if (rProc) { //skipped read processing (unmapped reads)
+ if (!rProc->process(rdata, found)) //, is_unmapped))
// rProc->process() should normally return TRUE
return false; //abort search for r_id, return "not found"
}
diff --git a/src/reads.h b/src/reads.h
index 7bf300e..9af4a7d 100644
--- a/src/reads.h
+++ b/src/reads.h
@@ -134,10 +134,8 @@ public:
}
};
-void skip_lines(FLineReader& fr);
-bool next_fasta_record(FLineReader& fr, string& defline, string& seq, ReadFormat reads_format);
-bool next_fastq_record(FLineReader& fr, const string& seq, string& alt_name, string& qual, ReadFormat reads_format);
-bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format=FASTQ,
+ReadFormat skip_lines(FLineReader& fr);
+bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat rd_format=FASTQ,
FLineReader* frq=NULL);
#define READSTREAM_BUF_SIZE 500000
@@ -147,9 +145,17 @@ struct QReadData { //read data for the priority queue
Read read;
char trashCode; //ZT tag value
int8_t matenum; //mate number (1,2) 0 if unpaired
- QReadData():id(0),read(),trashCode(0), matenum(0) { }
- QReadData(uint64_t rid, Read& rd, bam1_t* bd=NULL):
- id(rid), read(rd), trashCode(0), matenum(0) {
+ bool um_written;
+ QReadData():id(0),read(),trashCode(0), matenum(0), um_written(false) { }
+ QReadData(uint64_t rid, Read& rd, bam1_t* bd=NULL) {
+ init(rid, rd, bd);
+ }
+ void init(uint64_t rid, Read& rd, bam1_t* bd=NULL) {
+ id=rid;
+ read=rd;
+ trashCode=0;
+ matenum=0;
+ um_written=false;
if (bd) {
if (bd->core.flag & BAM_FREAD1) {
matenum=1;
@@ -160,6 +166,14 @@ struct QReadData { //read data for the priority queue
}
}
+ void clear() {
+ id=0;
+ read.clear();
+ trashCode=0;
+ matenum=0;
+ um_written=false;
+ }
+
};
@@ -168,14 +182,48 @@ struct GetReadProc {
GBamWriter* um_out; //skipped (unmapped) reads will be written here
int64_t* unmapped_counter;
int64_t* multimapped_counter;
- //char um_code;
- GetReadProc(GBamWriter* bamw=NULL, int64_t* um_counter=NULL, int64_t* mm_counter=NULL):
- um_out(bamw), unmapped_counter(um_counter), multimapped_counter(mm_counter) { }
- virtual bool process(QReadData& rdata, bool& found, bool is_unmapped) {
+
+ int64_t* u_unmapped_counter;
+ int64_t* u_multimapped_counter;
+
+ //char um_code;
+ GetReadProc(GBamWriter* bamw=NULL, int64_t* um_counter=NULL, int64_t* mm_counter=NULL,
+ int64_t* u_um_counter=NULL, int64_t* u_mm_counter=NULL):
+ um_out(bamw), unmapped_counter(um_counter), multimapped_counter(mm_counter),
+ u_unmapped_counter(u_um_counter), u_multimapped_counter(u_mm_counter)
+ { }
+ virtual bool process(QReadData& rdata, bool& found) { //, bool is_unmapped) {
//should return True - if it returns FALSE it will cause getRead() to abort
//(stops looking for target readId in the stream) and to return false (="not found")
return true;
}
+ void writeUnmapped(QReadData& rdata) {
+ if (rdata.um_written) return;
+ string rname(rdata.read.alt_name);
+ size_t slash_pos=rname.rfind('/');
+ if (slash_pos!=string::npos && rname.length()-slash_pos<4)
+ rname.resize(slash_pos);
+ GBamRecord bamrec(rname.c_str(), -1, 0, false, rdata.read.seq.c_str(),
+ NULL, rdata.read.qual.c_str());
+ if (rdata.matenum) {
+ bamrec.set_flag(BAM_FPAIRED);
+ if (rdata.matenum==1) bamrec.set_flag(BAM_FREAD1);
+ else bamrec.set_flag(BAM_FREAD2);
+ }
+ //if (found && um_code && !rdata.trashCode) {
+ //rdata.trashCode=um_code;
+ //}
+ if (rdata.trashCode) {
+ //multi-mapped reads did not really QC-fail
+ //should also not be written to unmapped.bam
+ bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&rdata.trashCode);
+ //if (rdata.trashCode!='M')
+ //bamrec.set_flag(BAM_FQCFAIL); //to be excluded from further processing?
+ }
+ um_out->write(&bamrec);
+ rdata.um_written=true;
+ }
+
virtual ~GetReadProc() { }
};
@@ -184,6 +232,7 @@ class ReadStream {
FLineReader* flquals;
FLineReader* flseqs;
bool stream_copy;
+ ReadFormat read_format; //should be guessed when opening the stream
bam1_t* b;
bool bam_alt_name; //from BAM files, look for alt_name tag to retrieve the original read name
bool bam_ignoreQC; //from BAM files, ignore QC flag (return the next read even if it has QC fail)
@@ -203,20 +252,20 @@ class ReadStream {
ReadOrdering > read_pq;
uint64_t last_id; //keep track of last requested ID, for consistency check
bool r_eof;
- bool next_read(QReadData& rdata, ReadFormat read_format=FASTQ); //get top read from the queue
+ bool next_read(QReadData& rdata); //get top read from the queue
public:
- ReadStream(int bufsize=READSTREAM_BUF_SIZE):flquals(NULL), flseqs(NULL), stream_copy(false), b(NULL),
+ ReadStream(int bufsize=READSTREAM_BUF_SIZE):flquals(NULL), flseqs(NULL), stream_copy(false), read_format(FASTX_AUTO), b(NULL),
bam_alt_name(false), bam_ignoreQC(false), fstream(), fquals(NULL),ReadBufSize(bufsize), read_pq(),
last_id(0), r_eof(false) { }
ReadStream(const string& fname, FZPipe* pquals=NULL, bool guess_packer=false):flquals(NULL),
- flseqs(NULL), stream_copy(false), b(NULL), bam_alt_name(false), bam_ignoreQC(false), fstream(),
+ flseqs(NULL), stream_copy(false), read_format(FASTX_AUTO), b(NULL), bam_alt_name(false), bam_ignoreQC(false), fstream(),
fquals(pquals), ReadBufSize(READSTREAM_BUF_SIZE), read_pq(), last_id(0), r_eof(false) {
init(fname, pquals, guess_packer);
}
ReadStream(FZPipe& f_stream, FZPipe* pquals=NULL):flquals(NULL),
- flseqs(NULL), stream_copy(true), b(NULL), bam_alt_name(false), bam_ignoreQC(false), fstream(f_stream),
+ flseqs(NULL), stream_copy(true), read_format(FASTX_AUTO), b(NULL), bam_alt_name(false), bam_ignoreQC(false), fstream(f_stream),
fquals(pquals), ReadBufSize(READSTREAM_BUF_SIZE), read_pq(), last_id(0), r_eof(false) {
//init(f_stream, pquals);
if (fstream.is_bam) {
@@ -224,7 +273,10 @@ class ReadStream {
}
else {
flseqs=new FLineReader(fstream.file);
- skip_lines(*flseqs);
+ read_format=skip_lines(*flseqs);
+ if (read_format==FASTX_AUTO)
+ warn_msg("Warning: could not determine format for file '%s'! (large header?)\n", fstream.filename.c_str());
+
}
fquals=pquals;
if (fquals) {
@@ -239,29 +291,33 @@ class ReadStream {
void ignoreQC(bool v=true) {
bam_ignoreQC=v;
}
-
+ ReadFormat get_format() { return read_format; }
void init(const string& fname, FZPipe* pquals=NULL, bool guess_packer=false) {
- if (fname.empty()) return;
- if (fstream.openRead(fname, guess_packer)==NULL) {
+ if (fname.empty()) return;
+ if (fstream.openRead(fname, guess_packer)==NULL) {
fprintf(stderr, "Warning: couldn't open file %s\n",fname.c_str());
return;
- }
- if (fstream.is_bam) {
+ }
+ if (fstream.is_bam) {
if (b==NULL) {
b = bam_init1();
}
- }
- else {
+ }
+ else {
if (b) { bam_destroy1(b); b=NULL; }
flseqs=new FLineReader(fstream.file);
- skip_lines(*flseqs);
- }
- fquals=pquals;
- if (fquals) {
+ read_format=skip_lines(*flseqs);
+ if (read_format==FASTX_AUTO)
+ warn_msg("Warning: could not determine format for file '%s'! (large header?)\n", fstream.filename.c_str());
+
+ }
+ fquals=pquals;
+ if (fquals) {
flquals=new FLineReader(fquals->file);
skip_lines(*flquals);
- }
- }
+ }
+ }
+
void init(FZPipe& f_stream, FZPipe* pquals=NULL) {
fstream=f_stream; //Warning - original copy may end up with invalid (closed) file handle
stream_copy=true;
@@ -277,17 +333,19 @@ class ReadStream {
else {
if (b) { bam_destroy1(b); b=NULL; }
flseqs=new FLineReader(fstream.file);
- skip_lines(*flseqs);
+ read_format=skip_lines(*flseqs);
+ if (read_format==FASTX_AUTO)
+ warn_msg("Warning: could not determine format for file '%s'! (large header?)\n", f_stream.filename.c_str());
}
fquals=pquals;
if (fquals) {
flquals=new FLineReader(fquals->file);
skip_lines(*flquals);
- }
}
+ }
//unbuffered reading from stream
- bool get_direct(Read& read, ReadFormat read_format=FASTQ);
+ bool get_direct(Read& read);
bool isBam() { return fstream.is_bam; }
bam1_t* last_b() {//return the latest SAM record data fetched by get_direct()
//must only be called after get_direct()
@@ -300,7 +358,7 @@ class ReadStream {
//read_ids must ALWAYS be requested in increasing order
bool getRead(uint64_t read_id, Read& read,
- ReadFormat read_format=FASTQ,
+ //ReadFormat read_format=FASTQ,
bool strip_slash=false,
uint64_t begin_id = 0,
uint64_t end_id=std::numeric_limits<uint64_t>::max(),
@@ -314,13 +372,22 @@ class ReadStream {
int64_t* multimapped_counter=NULL
*/
);
+ bool getQRead(uint64_t read_id,
+ QReadData& read,
+ uint64_t begin_id = 0,
+ uint64_t end_id=std::numeric_limits<uint64_t>::max(),
+ GetReadProc* rProc=NULL,
+ bool is_unmapped=false //the target read, when found is also written by
+ //rProc into the unmapped BAM file
+ );
+
void rewind() {
fstream.rewind();
clear();
if (flseqs) {
flseqs->reset(fstream);
- skip_lines(*flseqs);
+ skip_lines(*flseqs); //read_format detected earlier
}
if (flquals) {
flquals->reset(*fquals);
diff --git a/src/sam_juncs.cpp b/src/sam_juncs.cpp
index ea80e12..4c21079 100644
--- a/src/sam_juncs.cpp
+++ b/src/sam_juncs.cpp
@@ -118,8 +118,6 @@ int main(int argc, char** argv)
fprintf(stderr, "sam_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
fprintf(stderr, "---------------------------------------\n");
- reads_format = FASTQ;
-
int parse_ret = parse_options(argc, argv, print_usage);
if (parse_ret)
return parse_ret;
diff --git a/src/segment_juncs.cpp b/src/segment_juncs.cpp
index 80fb3b7..e3acc46 100644
--- a/src/segment_juncs.cpp
+++ b/src/segment_juncs.cpp
@@ -450,23 +450,7 @@ void count_read_mers(string& reads_file, size_t half_splice_mer_len)
size_t mer_table_size = 1 << ((splice_mer_len)<<1);
extension_counts.resize(mer_table_size);
ReadStream readstream(reads_file);
- //FLineReader fr(reads_file);
- //while(!feof(reads_file))
- while (readstream.get_direct(read, reads_format)) {
- /*
- while (!fr.isEof())
- {
- read.clear();
-
- // Get the next read from the file
- if (!next_fasta_record(fr, read.name, read.seq, reads_format))
- break;
- if (reads_format == FASTQ)
- {
- if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
- break;
- }
- */
+ while (readstream.get_direct(read)) {
if (color && !readstream.isBam())
// erase the primer and the adjacent color
read.seq.erase(0, 2);
@@ -531,23 +515,7 @@ void store_read_mers(string& reads_file, size_t half_splice_mer_len)
size_t num_indexed_reads = 0;
ReadStream readstream(reads_file);
- while (readstream.get_direct(read, reads_format)) {
- /*
- FLineReader fr(reads_file);
- //while(!feof(reads_file))
- while(!fr.isEof())
- {
- read.clear();
-
- // Get the next read from the file
- if (!next_fasta_record(fr, read.name, read.seq, reads_format))
- break;
- if (reads_format == FASTQ)
- {
- if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
- break;
- }
- */
+ while (readstream.get_direct(read)) {
if (color && !readstream.isBam())
// erase the primer and the adjacent color
read.seq.erase(0, 2);
@@ -2530,7 +2498,7 @@ void detect_small_insertion(RefSequenceTable& rt,
bestInsertPosition + discrepancy <= (int)length(left_read_sequence)){
String<char> insertedSequence = seqan::infix(left_read_sequence, bestInsertPosition, bestInsertPosition + discrepancy);
if(color)
- insertedSequence = convert_color_to_bp(genomic_sequence_temp[bestInsertPosition - begin_offset + end_offset - 1], insertedSequence);
+ insertedSequence = convert_color_to_bp(genomic_sequence_temp[bestInsertPosition - begin_offset + end_offset - 1], insertedSequence);
insertions.insert(Insertion(leftHit.ref_id(),
leftHit.left() + bestInsertPosition - 1 + end_offset,
seqan::toCString(insertedSequence)));
diff --git a/src/tophat-fusion-post b/src/tophat-fusion-post
index 2c13592..20a8f44 100755
--- a/src/tophat-fusion-post
+++ b/src/tophat-fusion-post
@@ -311,6 +311,7 @@ def map_fusion_kmer(bwt_idx_prefix, params, sample_update = False):
fusion_kmer_file_name = output_dir + "fusion_seq.map"
if not os.path.exists(fusion_kmer_file_name) or \
+ os.stat(fusion_kmer_file_name).st_size <= 0 or \
sample_update:
get_fusion_seq()
cmd = ['bowtie', '-p', '8', '-a', '-n', '3', '-m', '100', bwt_idx_prefix, '-f', '%sfusion_seq.fa' % output_dir]
@@ -1897,10 +1898,6 @@ def generate_html(params):
for i in range(min(max_num_fusions, len(cluster_temp_list))):
do_not_add = False
indices = cluster_temp_list[i]["index"]
-
- if num_samples(indices) > 5:
- do_not_add = True
-
if not do_not_add:
def cmp(a, b):
return int(fusion_list[b]["score"] - fusion_list[a]["score"])
@@ -2588,7 +2585,7 @@ def prog_path(program):
def get_version():
- return "2.0.9"
+ return "2.1.0"
def main(argv=None):
diff --git a/src/tophat.py b/src/tophat.py
index d1685c5..94f5848 100755
--- a/src/tophat.py
+++ b/src/tophat.py
@@ -415,7 +415,7 @@ class TopHatParams:
color,
library_type,
seed_length,
- reads_format,
+ # reads_format,
mate_inner_dist,
mate_inner_dist_std_dev,
read_group_id,
@@ -433,7 +433,7 @@ class TopHatParams:
self.color = color
self.library_type = library_type
self.seed_length = seed_length
- self.reads_format = reads_format
+ # self.reads_format = reads_format
self.mate_inner_dist = mate_inner_dist
self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
self.read_group_id = read_group_id
@@ -682,7 +682,7 @@ class TopHatParams:
False, # SOLiD - color space
"", # library type (e.g. "illumina-stranded-pair-end")
None, # seed_length
- "fastq", # quality_format
+ # "fastq", # quality_format
None, # mate inner distance
20, # mate inner dist std dev
None, # read group id
@@ -714,6 +714,7 @@ class TopHatParams:
self.transcriptome_only = False
self.transcriptome_index = None
self.transcriptome_outdir = None
+ self.transcriptome_buildonly = None
self.raw_junctions = None
self.resume_dir = None
self.find_novel_juncs = True
@@ -1138,7 +1139,10 @@ class TopHatParams:
tmp_dir = custom_tmp_dir
sam_header = tmp_dir + "stub_header.sam"
if len(args) < 2 and not self.resume_dir:
- raise Usage(use_message)
+ if len(args) == 1 and self.transcriptome_index and self.gff_annotation:
+ self.transcriptome_buildonly = True
+ else:
+ raise Usage(use_message)
if self.read_realign_edit_dist == None:
self.read_realign_edit_dist = self.read_edit_dist + 1
@@ -1816,7 +1820,7 @@ class ZWriter:
# to determines the file format
def check_reads_format(params, reads_files):
#seed_len = params.read_params.seed_length
- fileformat = params.read_params.reads_format
+ #fileformat = params.read_params.reads_format
observed_formats = set([])
# observed_scales = set([])
@@ -1842,25 +1846,27 @@ def check_reads_format(params, reads_files):
max_seed_len = max(seq_len, max_seed_len)
zf.close()
observed_formats.add(freader.format)
- if len(observed_formats) > 1:
- die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
+# if len(observed_formats) > 1:
+# die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
fileformat=list(observed_formats)[0]
- #if seed_len != None:
- # seed_len = max(seed_len, max_seed_len)
- #else:
- # seed_len = max_seed_len
- #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
- th_logp("\tformat:\t\t %s" % fileformat)
- if fileformat == "fastq":
- quality_scale = "phred33 (default)"
- if params.read_params.solexa_quals and not params.read_params.phred64_quals:
- quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
- elif params.read_params.phred64_quals:
- quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
- th_logp("\tquality scale:\t %s" % quality_scale)
- elif fileformat == "fasta":
- if params.read_params.color:
- params.read_params.integer_quals = True
+# #if seed_len != None:
+# # seed_len = max(seed_len, max_seed_len)
+# #else:
+# # seed_len = max_seed_len
+# #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
+# th_logp("\tformat:\t\t %s" % fileformat)
+# if fileformat == "fastq":
+# quality_scale = "phred33 (default)"
+# if params.read_params.solexa_quals and not params.read_params.phred64_quals:
+# quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
+# elif params.read_params.phred64_quals:
+# quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
+# th_logp("\tquality scale:\t %s" % quality_scale)
+# elif fileformat == "fasta":
+# if params.read_params.color:
+# params.read_params.integer_quals = True
+ if params.read_params.color and fileformat == "fasta":
+ params.read_params.integer_quals = True
#print seed_len, format, solexa_scale
#NOTE: seed_len will be re-evaluated later by prep_reads
@@ -1870,9 +1876,9 @@ def check_reads_format(params, reads_files):
params.read_params.integer_quals,
params.read_params.color,
params.read_params.library_type,
- #seed_len,
+ # seed_len,
params.read_params.seed_length,
- fileformat,
+ # fileformat,
params.read_params.mate_inner_dist,
params.read_params.mate_inner_dist_std_dev,
params.read_params.read_group_id,
@@ -1970,10 +1976,10 @@ def prep_reads_cmd(params, l_reads_list, l_quals_list=None, r_reads_list=None, r
prep_cmd.extend(params.cmd())
- if params.read_params.reads_format == "fastq":
- prep_cmd += ["--fastq"]
- elif params.read_params.reads_format == "fasta":
- prep_cmd += ["--fasta"]
+# if params.read_params.reads_format == "fastq":
+# prep_cmd += ["--fastq"]
+# elif params.read_params.reads_format == "fasta":
+# prep_cmd += ["--fasta"]
if hits_to_filter:
prep_cmd += ["--flt-hits=" + ",".join(hits_to_filter)]
if aux_file:
@@ -2094,7 +2100,7 @@ def bowtie(params,
bwt_idx_prefix,
sam_headers,
reads_list,
- reads_format,
+ # reads_format,
num_mismatches,
gap_length,
edit_dist,
@@ -2174,10 +2180,10 @@ def bowtie(params,
# Launch Bowtie
try:
bowtie_cmd = [bowtie_path]
- if reads_format == "fastq":
- bowtie_cmd += ["-q"]
- elif reads_format == "fasta":
- bowtie_cmd += ["-f"]
+# if reads_format == "fastq":
+# bowtie_cmd += ["-q"]
+# elif reads_format == "fasta":
+# bowtie_cmd += ["-f"]
if params.read_params.color:
bowtie_cmd += ["-C", "--col-keepends"]
@@ -2188,8 +2194,8 @@ def bowtie(params,
unzip_cmd=[ prog_path('bam2fastx'), "--all" ]
if params.read_params.color:
unzip_cmd.append("--color")
- if reads_format:
- unzip_cmd.append("--" + reads_format)
+ #if reads_format:
+ # unzip_cmd.append("--" + reads_format)
unzip_cmd+=[reads_list[0]]
if use_zpacker and (unzip_cmd is None):
@@ -2240,7 +2246,8 @@ def bowtie(params,
if not params.bowtie2:
fix_map_cmd += ["--bowtie1"]
if multihits_out != None:
- fix_map_cmd += ["--aux-outfile", params.preflt_data[multihits_out].multihit_reads]
+ if params.bowtie2:
+ fix_map_cmd += ["--aux-outfile", params.preflt_data[multihits_out].multihit_reads]
fix_map_cmd += ["--max-multihits", str(params.max_hits)]
if t_mapping:
out_bam = "-" # we'll pipe into map2gtf
@@ -3046,7 +3053,6 @@ def junctions_from_segments(params,
right_reads_map,
right_seg_maps,
unmapped_reads,
- reads_format,
ref_fasta):
# if left_reads_map != left_seg_maps[0]:
@@ -3276,18 +3282,19 @@ def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_rea
m2g_bwt_idx = params.transcriptome_index
th_log("Using pre-built transcriptome data..")
else:
- th_log("Building transcriptome data files..")
if params.transcriptome_outdir:
t_out_dir=params.transcriptome_outdir+"/"
+ th_log("Building transcriptome data files "+t_out_dir+gtf_name)
m2g_ref_name = t_out_dir + gtf_name
m2g_ref_fasta = gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_name)
m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)
params.transcriptome_index = m2g_bwt_idx
-
+ if params.transcriptome_buildonly:
+ return
transcriptome_header_filename = get_index_sam_header(params, m2g_bwt_idx)
mapped_gtf_list = []
- unmapped_gtf_list = []
+ unmapped_gtf_list = []
# do the initial mapping in GTF coordinates
for reads in [left_reads, right_reads]:
if reads == None or os.path.getsize(reads) < 25 :
@@ -3305,7 +3312,6 @@ def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_rea
m2g_bwt_idx,
[transcriptome_header_filename, genome_sam_header_filename],
[reads],
- "fastq",
params.read_mismatches,
params.read_gap_length,
params.read_edit_dist,
@@ -3508,7 +3514,6 @@ def spliced_alignment(params,
bwt_idx_prefix,
sam_header_filename,
[reads],
- "fastq",
params.read_mismatches,
params.read_gap_length,
params.read_edit_dist,
@@ -3546,7 +3551,6 @@ def spliced_alignment(params,
bwt_idx_prefix,
sam_header_filename,
[seg],
- "fastq",
params.segment_mismatches,
params.segment_mismatches,
params.segment_mismatches,
@@ -3604,7 +3608,6 @@ def spliced_alignment(params,
right_reads_map,
right_seg_maps,
unmapped_reads,
- "fastq",
ref_fasta)
if not params.system_params.keep_tmp:
@@ -3704,7 +3707,6 @@ def spliced_alignment(params,
tmp_dir + junc_idx_prefix,
juncs_bwt_samheader,
[seg],
- "fastq",
params.segment_mismatches,
params.segment_mismatches,
params.segment_mismatches,
@@ -3837,7 +3839,9 @@ def main(argv=None):
params.check()
bwt_idx_prefix = args[0]
- left_reads_list = args[1]
+ left_reads_list = None
+ if len(args)>1:
+ left_reads_list = args[1]
left_quals_list, right_quals_list = None, None
if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
if params.read_params.mate_inner_dist == None:
@@ -3854,6 +3858,7 @@ def main(argv=None):
left_quals_list = args[2]
start_time = datetime.now()
+
prepare_output_dir()
init_logger(logging_dir + "tophat.log")
@@ -3861,7 +3866,10 @@ def main(argv=None):
if resumeStage>0:
th_log("Resuming TopHat run in directory '"+output_dir+"' stage '"+stageNames[resumeStage]+"'")
else:
- th_log("Beginning TopHat run (v"+get_version()+")")
+ if params.transcriptome_buildonly:
+ th_log("Building transcriptome files with TopHat v"+get_version())
+ else:
+ th_log("Beginning TopHat run (v"+get_version()+")")
th_logp("-----------------------------------------------")
global run_log
@@ -3913,7 +3921,11 @@ def main(argv=None):
#end @ transcriptome_index given
(ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)
-
+ if params.transcriptome_buildonly:
+ map2gtf(params, "", ref_fasta, [], [])
+ th_logp("-----------------------------------------------")
+ th_log("Transcriptome files prepared. This was the only task requested.")
+ return
if currentStage >= resumeStage:
th_log("Generating SAM header for "+bwt_idx_prefix)
# we need to provide another name for this sam header as genome and transcriptome may have the same prefix.
@@ -3978,7 +3990,7 @@ def main(argv=None):
th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
rdlist=reads_list.split(',')
bwt=bowtie(params, bwt_idx_prefix, sam_header_filename, rdlist,
- params.read_params.reads_format,
+ # params.read_params.reads_format,
params.read_mismatches,
params.read_gap_length,
params.read_edit_dist,
@@ -4059,7 +4071,7 @@ def main(argv=None):
th_logp("-----------------------------------------------")
- th_log("A summary of the alignment counts can be found in %salign_summary.txt" % output_dir);
+ th_log("A summary of the alignment counts can be found in %salign_summary.txt" % output_dir)
th_log("Run complete: %s elapsed" % formatTD(duration))
except Usage, err:
diff --git a/src/tophat_reports.cpp b/src/tophat_reports.cpp
index 9d0a927..91e39df 100644
--- a/src/tophat_reports.cpp
+++ b/src/tophat_reports.cpp
@@ -126,38 +126,34 @@ char read_best_alignments(const HitsForRead& hits_for_read,
ret_code |= 16;
return ret_code; //too many hits
}*/
- unsigned int nhits=0;
+ unsigned int nhits=0;
for (size_t i = 0; i < hits.size(); ++i)
{
- if (hits[i].mismatches() > read_mismatches ||
- hits[i].gap_length() > read_gap_length ||
- hits[i].edit_dist() > read_edit_dist)
- continue;
- ret_code |= 1; //read has a valid mapping
- nhits++;
- if (nhits>1) ret_code |= 4; //read has multiple valid mappings
- if (nhits>max_multihits) ret_code |= 16; //read has too many valid mappings
- BowtieHit hit = hits[i];
- AlignStatus align_status(hit, gtf_junctions,
- junctions, insertions, deletions, fusions, coverage);
- hit.alignment_score(align_status._alignment_score);
- if (report_secondary_alignments || !final_report)
+ if (hits[i].mismatches() > read_mismatches ||
+ hits[i].gap_length() > read_gap_length ||
+ hits[i].edit_dist() > read_edit_dist)
+ continue;
+ BowtieHit hit = hits[i];
+ AlignStatus align_status(hit, gtf_junctions,
+ junctions, insertions, deletions, fusions, coverage);
+ hit.alignment_score(align_status._alignment_score);
+ if (report_secondary_alignments || !final_report)
+ {
+ best_hits.hits.push_back(hit);
+ }
+ else
+ {
+ // Is the new status better than the current best one?
+ if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0]))
{
+ best_hits.hits.clear();
best_hits.hits.push_back(hit);
}
- else
+ else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good?
{
- // Is the new status better than the current best one?
- if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0]))
- {
- best_hits.hits.clear();
- best_hits.hits.push_back(hit);
- }
- else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good?
- {
- best_hits.hits.push_back(hit);
- }
+ best_hits.hits.push_back(hit);
}
+ }
}
// due to indel alignments, there may be alignments with the same location
@@ -167,16 +163,16 @@ char read_best_alignments(const HitsForRead& hits_for_read,
if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0)
{
- sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment());
+ sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment());
}
- if (final_report)
- {
- if (suppress_hits && best_hits.hits.size() > max_multihits)
+ if (final_report) {
+ if (best_hits.hits.size() > max_multihits) {
+ ret_code |= 16; //read has too many valid mappings
+ if (suppress_hits)
best_hits.hits.clear();
-
- if (best_hits.hits.size() > max_multihits)
- {
+ else {
+ //if (best_hits.hits.size() > max_multihits)
// there may be several alignments with the same alignment scores,
// all of which we can not include because of this limit.
// let's pick up some of them randomly up to this many max multihits.
@@ -208,6 +204,11 @@ char read_best_alignments(const HitsForRead& hits_for_read,
best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
}
+ }
+ if ((nhits=best_hits.hits.size())>0) {
+ ret_code |= 1; //read has a valid mapping
+ if (nhits>1) ret_code |= 4; //read has multiple mappings
+ }
}
return ret_code;
}
@@ -314,16 +315,25 @@ struct SAlignStats {
int64_t num_unmapped_left; //total left reads unmapped, or mapped in too many places (!)
int64_t num_aligned_left_multi; //total left reads mapped in more than 1 place
int64_t num_aligned_left_xmulti; //total left reads mapped in too many places (> max_multihits)
+
int64_t num_aligned_right; //total right reads aligned
int64_t num_unmapped_right; //total right reads unmapped, or mapped in too many places (!)
int64_t num_aligned_right_multi; //total right reads in more than 1 place
int64_t num_aligned_right_xmulti; //total right reads mapped in too many places (> max_multihits)
+
int64_t num_aligned_pairs; //total pairs aligned
int64_t num_aligned_pairs_multi; //total pairs aligned in more than 1 place
int64_t num_aligned_pairs_disc; //total pairs aligned discordantly only
+
+ int64_t num_aligned_unpaired; //total unpaired reads aligned, when mixed with PE
+ int64_t num_unmapped_unpaired; //total right reads unmapped, or mapped in too many places (!)
+ int64_t num_aligned_unpaired_multi;
+ int64_t num_aligned_unpaired_xmulti; //total right reads mapped in too many places (> max_multihits)
+
SAlignStats():num_aligned_left(0), num_unmapped_left(0), num_aligned_left_multi(0), num_aligned_left_xmulti(0), num_aligned_right(0),
num_unmapped_right(0), num_aligned_right_multi(0), num_aligned_right_xmulti(0), num_aligned_pairs(0), num_aligned_pairs_multi(0),
- num_aligned_pairs_disc(0) { }
+ num_aligned_pairs_disc(0), num_aligned_unpaired(0), num_unmapped_unpaired(0),
+ num_aligned_unpaired_multi(0), num_aligned_unpaired_xmulti(0) { }
void add(SAlignStats& a) {
num_aligned_left+=a.num_aligned_left;
num_unmapped_left+=a.num_unmapped_left;
@@ -336,6 +346,10 @@ struct SAlignStats {
num_aligned_pairs+=a.num_aligned_pairs;
num_aligned_pairs_multi+=a.num_aligned_pairs_multi;
num_aligned_pairs_disc+=a.num_aligned_pairs_disc;
+ num_aligned_unpaired+=a.num_aligned_unpaired;
+ num_unmapped_unpaired+=a.num_unmapped_unpaired;
+ num_aligned_unpaired_multi+=a.num_aligned_unpaired_multi;
+ num_aligned_unpaired_xmulti+=a.num_aligned_unpaired_xmulti;
}
};
@@ -367,7 +381,7 @@ char pair_best_alignments(const HitsForRead& left_hits,
if (ret_code)
return ret_code;
*/
- unsigned int l_nhits=0, r_nhits=0;
+ unsigned int nhits=0;
vector<BowtieHit> rhits;
@@ -377,34 +391,25 @@ char pair_best_alignments(const HitsForRead& left_hits,
right[j].gap_length() > read_gap_length ||
right[j].edit_dist() > read_edit_dist)
continue;
- r_nhits++;
- ret_code|=2; //right read has acceptable mappings
- if (r_nhits>1) ret_code |= 8; //right read has multiple valid mappings
- if (r_nhits>max_multihits) {
- ret_code |= 32; //left read has too many mappings
- //if (r_nhits > (max_multihits<<2))
- break;
- }
- BowtieHit rh = right[j];
- AlignStatus align_status(rh, gtf_junctions,
- junctions, insertions, deletions, fusions, coverage);
- rh.alignment_score(align_status._alignment_score);
- rhits.push_back(rh);
+ nhits++;
+ if ((nhits>>1) > max_multihits)
+ break;
+ BowtieHit rh = right[j];
+ AlignStatus align_status(rh, gtf_junctions,
+ junctions, insertions, deletions, fusions, coverage);
+ rh.alignment_score(align_status._alignment_score);
+ rhits.push_back(rh);
}
-
+ nhits=0;
for (size_t i = 0; i < left.size(); ++i)
{
if (left[i].mismatches() > read_mismatches ||
left[i].gap_length() > read_gap_length ||
left[i].edit_dist() > read_edit_dist)
continue;
- l_nhits++;
- ret_code|=1; //left read has acceptable mappings
- if (l_nhits>1) ret_code |= 4; //left read has multiple valid mappings
- if (l_nhits>max_multihits) {
- ret_code |= 16; //left read has too many mappings
- //if (l_nhits > (max_multihits<<2))
- break;
+ nhits++;
+ if ((nhits>>1)>max_multihits) {
+ break;
}
BowtieHit lh = left[i];
AlignStatus align_status(lh, gtf_junctions,
@@ -516,8 +521,15 @@ if (lh.insert_id() == 10790262)
if (final_report)
{
- if (suppress_hits && best_hits.size() > max_multihits)
- best_hits.clear();
+ nhits = best_hits.size();
+ if (nhits>0) {
+ ret_code|=3; //both reads have acceptable mappings
+ }
+ if (nhits>1) ret_code |= 12; //both reads have multiple valid mappings
+ if (nhits>max_multihits) {
+ ret_code |= 48; //both reads have too many mappings
+ if (suppress_hits) best_hits.clear();
+ }
if (best_hits.size() > max_multihits)
{
@@ -597,37 +609,37 @@ void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
// requirement, and that the strand indicated by the alignment is consistent
// with the orientation of the splices (though that should be handled upstream).
if (!XS_found) {
- const string xs_f("XS:A:+");
- const string xs_r("XS:A:-");
- if (library_type == FR_FIRSTSTRAND) {
- if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
- if (bh.antisense_align())
- auxdata.push_back(xs_f);
- else
- auxdata.push_back(xs_r);
- }
- else {
- if (bh.antisense_align())
- auxdata.push_back(xs_r);
- else
- auxdata.push_back(xs_f);
- }
- }
- else if (library_type == FR_SECONDSTRAND) {
- if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
- if (bh.antisense_align())
- auxdata.push_back(xs_r);
- else
- auxdata.push_back(xs_f);
- }
- else
- {
- if (bh.antisense_align())
- auxdata.push_back(xs_f);
- else
- auxdata.push_back(xs_r);
- }
- }
+ const string xs_f("XS:A:+");
+ const string xs_r("XS:A:-");
+ if (library_type == FR_FIRSTSTRAND) {
+ if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
+ if (bh.antisense_align())
+ auxdata.push_back(xs_f);
+ else
+ auxdata.push_back(xs_r);
+ }
+ else {
+ if (bh.antisense_align())
+ auxdata.push_back(xs_r);
+ else
+ auxdata.push_back(xs_f);
+ }
+ }
+ else if (library_type == FR_SECONDSTRAND) {
+ if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
+ if (bh.antisense_align())
+ auxdata.push_back(xs_r);
+ else
+ auxdata.push_back(xs_f);
+ }
+ else
+ {
+ if (bh.antisense_align())
+ auxdata.push_back(xs_f);
+ else
+ auxdata.push_back(xs_r);
+ }
+ }
}
if (hitIndex >= 0)
{
@@ -2106,16 +2118,17 @@ void print_alnStats(SAlignStats& alnStats) {
FILE* f = fopen(fname.c_str(), "w");
int64_t total_left=alnStats.num_aligned_left+alnStats.num_unmapped_left;
int64_t total_right=alnStats.num_aligned_right+alnStats.num_unmapped_right;
+ int64_t total_unpaired=alnStats.num_aligned_unpaired+alnStats.num_unmapped_unpaired;
//int64_t accepted_left =alnStats.num_aligned_left -alnStats.num_aligned_left_xmulti; //accepted mappings, < max_multihits
//int64_t accepted_right=alnStats.num_aligned_right-alnStats.num_aligned_right_xmulti; //accepted right mappings
string rdn("Left reads");
if (total_right==0) rdn="Reads";
fprintf(f, "%s:\n", rdn.c_str());
- fprintf(f, " Input: %9ld\n", total_left);
+ fprintf(f, " Input : %9ld\n", total_left);
double perc=(100.0*alnStats.num_aligned_left)/total_left;
- fprintf(f, " Mapped: %9ld (%4.1f%% of input)\n",
+ fprintf(f, " Mapped : %9ld (%4.1f%% of input)\n",
alnStats.num_aligned_left, perc);
- if (alnStats.num_aligned_left) {
+ if (alnStats.num_aligned_left && alnStats.num_aligned_left_multi>0) {
perc=(100.0*alnStats.num_aligned_left_multi)/alnStats.num_aligned_left;
fprintf(f," of these: %9ld (%4.1f%%) have multiple alignments (%ld have >%d)\n",
alnStats.num_aligned_left_multi, perc, alnStats.num_aligned_left_xmulti, max_multihits);
@@ -2129,33 +2142,48 @@ void print_alnStats(SAlignStats& alnStats) {
int64_t total_pairs=0;
if (total_right) {
fprintf(f, "Right reads:\n");
- fprintf(f, " Input: %9ld\n", total_right);
+ fprintf(f, " Input : %9ld\n", total_right);
total_input+=total_right;
perc=(100.0*alnStats.num_aligned_right)/total_right;
- fprintf(f, " Mapped: %9ld (%4.1f%% of input)\n",
+ fprintf(f, " Mapped : %9ld (%4.1f%% of input)\n",
alnStats.num_aligned_right, perc);
- if (alnStats.num_aligned_right) {
+ if (alnStats.num_aligned_right && alnStats.num_aligned_right_multi>0) {
perc=(100.0* alnStats.num_aligned_right_multi)/alnStats.num_aligned_right;
fprintf(f," of these: %9ld (%4.1f%%) have multiple alignments (%ld have >%d)\n",
alnStats.num_aligned_right_multi, perc, alnStats.num_aligned_right_xmulti, max_multihits);
}
-/* perc=(100.0*accepted_right)/total_left;
- fprintf(f, " Mapped acceptably: %9ld (%4.1f%% of input)\n",
- accepted_right, perc);
-*/
total_mapped+=alnStats.num_aligned_right;
total_pairs=(total_right<total_left)? total_right : total_left;
+
+ if (total_unpaired) {
+ fprintf(f, "Unpaired reads:\n");
+ fprintf(f, " Input : %9ld\n", total_unpaired);
+ total_input+=total_unpaired;
+ perc=(100.0*alnStats.num_aligned_unpaired)/total_unpaired;
+ fprintf(f, " Mapped : %9ld (%4.1f%% of input)\n",
+ alnStats.num_aligned_unpaired, perc);
+ if (alnStats.num_aligned_unpaired && alnStats.num_aligned_unpaired_multi>0) {
+ perc=(100.0* alnStats.num_aligned_unpaired_multi)/alnStats.num_aligned_unpaired;
+ fprintf(f," of these: %9ld (%4.1f%%) have multiple alignments (%ld have >%d)\n",
+ alnStats.num_aligned_unpaired_multi, perc, alnStats.num_aligned_unpaired_xmulti, max_multihits);
+ }
+ total_mapped+=alnStats.num_aligned_unpaired;
+ }
}
perc=(100.0*total_mapped)/total_input;
- fprintf(f, "%4.1f%% overall read alignment rate.\n", perc);
+ fprintf(f, "%4.1f%% overall read mapping rate.\n", perc);
if (alnStats.num_aligned_pairs) {
fprintf(f, "\nAligned pairs: %9ld\n", alnStats.num_aligned_pairs);
- perc=(100.0*alnStats.num_aligned_pairs_multi)/alnStats.num_aligned_pairs;
- fprintf(f, " of these: %9ld (%4.1f%%) have multiple alignments\n",
+ if (alnStats.num_aligned_pairs_multi>0) {
+ perc=(100.0*alnStats.num_aligned_pairs_multi)/alnStats.num_aligned_pairs;
+ fprintf(f, " of these: %9ld (%4.1f%%) have multiple alignments\n",
alnStats.num_aligned_pairs_multi, perc);
- perc=(100.0*alnStats.num_aligned_pairs_disc)/alnStats.num_aligned_pairs;
- fprintf(f, " and: %9ld (%4.1f%%) are discordant alignments\n",
+ }
+ if (alnStats.num_aligned_pairs_disc>0) {
+ perc=(100.0*alnStats.num_aligned_pairs_disc)/alnStats.num_aligned_pairs;
+ fprintf(f, " %9ld (%4.1f%%) are discordant alignments\n",
alnStats.num_aligned_pairs_disc, perc);
+ }
perc=(100.0*(alnStats.num_aligned_pairs-alnStats.num_aligned_pairs_disc))/total_pairs;
fprintf(f, "%4.1f%% concordant pair alignment rate.\n", perc);
}
@@ -2163,43 +2191,24 @@ void print_alnStats(SAlignStats& alnStats) {
}
struct CReadProc: public GetReadProc {
- CReadProc(GBamWriter* bamw=NULL, int64_t* um_counter=NULL, int64_t* mm_counter=NULL):
- GetReadProc(bamw, um_counter, mm_counter) {}
- virtual bool process(QReadData& rdata, bool& found, bool is_unmapped) {
- //if (um_out && ((um_code && found) || !found )) {
- string rname(rdata.read.alt_name);
- //-- DEBUG
- //fprintf(stderr, ">WUM\t%c\t%s\t%c\t/%d\n", found?'F':'U',
- // rname.c_str(), (um_code && found) ? um_code: '-', rdata.matenum);
- //-- DEBUG.
- size_t slash_pos=rname.rfind('/');
- if (slash_pos!=string::npos)
- rname.resize(slash_pos);
- GBamRecord bamrec(rname.c_str(), -1, 0, false, rdata.read.seq.c_str(),
- NULL, rdata.read.qual.c_str());
- if (rdata.matenum) {
- bamrec.set_flag(BAM_FPAIRED);
- if (rdata.matenum==1) bamrec.set_flag(BAM_FREAD1);
- else bamrec.set_flag(BAM_FREAD2);
- }
- //if (found && um_code && !rdata.trashCode) {
- //rdata.trashCode=um_code;
- //}
- if (rdata.trashCode) {
- //multi-mapped reads did not really QC-fail
- //should also not be written to unmapped.bam
- bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&rdata.trashCode);
- if (rdata.trashCode!='M') {
- bamrec.set_flag(BAM_FQCFAIL); //to be excluded from further processing?
- }
- }
- if (is_unmapped || (rdata.trashCode!='M' && !found)) {
- um_out->write(&bamrec);
+ CReadProc(GBamWriter* bamw=NULL, int64_t* um_counter=NULL, int64_t* mm_counter=NULL,
+ int64_t* u_um_counter=NULL, int64_t* u_mm_counter=NULL):
+ GetReadProc(bamw, um_counter, mm_counter, u_um_counter, u_mm_counter) {}
+ virtual bool process(QReadData& rdata, bool& found) { //, bool is_unmapped) {
+ //if (is_unmapped || (rdata.trashCode!='M' && !found)) {
+ if (rdata.trashCode!='M' && !found) {
+ this->writeUnmapped(rdata);
}
//
if (unmapped_counter && !found) {
- if (rdata.trashCode!='M') (*unmapped_counter)++;
- else if (multimapped_counter) (*multimapped_counter)++;
+ if (rdata.trashCode!='M') {
+ if (rdata.matenum) (*unmapped_counter)++;
+ else if (u_unmapped_counter) (*u_unmapped_counter)++;
+ }
+ else if (multimapped_counter) {
+ if (rdata.matenum) (*multimapped_counter)++;
+ else if (u_multimapped_counter) (*u_multimapped_counter)++;
+ }
}
return true;
}
@@ -2211,27 +2220,48 @@ struct ReportWorker {
insertions(NULL), deletions(NULL), rev_deletions(NULL), fusions(NULL), coverage(NULL),
final_junctions(NULL), final_insertions(NULL), final_deletions(NULL), final_fusions(NULL),
rt(r), begin_id(0), end_id(0), left_reads_offset(0), left_map_offset(0), right_reads_offset(0),
- right_map_offset(0), alnStats(s), is_paired(false) { }
+ right_map_offset(0), alnStats(s), PE_reads(false) { }
void write_singleton_alignments(uint64_t curr_obs_order,
HitsForRead& curr_hit_group, ReadStream& reads_file,
GBamWriter& bam_writer, FragmentType fragment_type,
GetReadProc* readProc=NULL,
- Read* gotRead=NULL) {
+ QReadData* gotRead=NULL) {
int64_t* unmapped_counter = & alnStats->num_unmapped_left;
int64_t* aligned_counter = & alnStats->num_aligned_left;
int64_t* aligned_counter_multi = & alnStats->num_aligned_left_multi;
int64_t* aligned_counter_xmulti = & alnStats->num_aligned_left_xmulti;
- if (fragment_type == FRAG_RIGHT) {
+
+ QReadData read;
+ if (gotRead==NULL) {
+ if (reads_file.getQRead(curr_obs_order, read,
+ begin_id, end_id, readProc))
+ gotRead=&read;
+ }
+ if (gotRead==NULL) {
+ //Should never happen!
+ warn_msg("Error: singleton getQRead() failed for id# %ld.\n", curr_obs_order);
+ return;
+ }
+
+ if (gotRead->matenum==0) {
+ fragment_type = FRAG_UNPAIRED;
+ if (PE_reads) {
+ unmapped_counter = & alnStats->num_unmapped_unpaired;
+ aligned_counter = & alnStats->num_aligned_unpaired;
+ aligned_counter_multi = & alnStats->num_aligned_unpaired_multi;
+ aligned_counter_xmulti = & alnStats->num_aligned_unpaired_xmulti;
+ }
+ } else if (fragment_type == FRAG_RIGHT) {
unmapped_counter = & alnStats->num_unmapped_right;
aligned_counter = & alnStats->num_aligned_right;
aligned_counter_multi = & alnStats->num_aligned_right_multi;
aligned_counter_xmulti = & alnStats->num_aligned_right_xmulti;
}
- if (is_paired && !report_mixed_alignments) {
- // FIXME: how did we get here?
- // can this be a problem if input reads are mixed: paired + single ?
- if (!gotRead) {
+
+ if (PE_reads && !report_mixed_alignments) {
+ //the user only wants paired alignments reported
+ /* if (!gotRead) {
if (curr_hit_group.hits.size()>1) {
(*aligned_counter_multi)++;
}
@@ -2239,60 +2269,52 @@ struct ReportWorker {
(*aligned_counter)++;
}
}
+ */
+ if (readProc) readProc->writeUnmapped(*gotRead);
+ (*unmapped_counter)++;
return;
}
HitsForRead best_hits;
best_hits.insert_id = curr_obs_order;
realign_reads(curr_hit_group, *rt, *junctions, *rev_junctions, *insertions,
- *deletions, *rev_deletions, *fusions);
+ *deletions, *rev_deletions, *fusions);
exclude_hits_on_filtered_junctions(*junctions, curr_hit_group);
// Process hits for singleton, select best alignments
const bool final_report = true;
char map_flags=read_best_alignments(curr_hit_group, best_hits, *gtf_junctions, *junctions,
- *insertions, *deletions, *fusions, *coverage, final_report, &rng);
+ *insertions, *deletions, *fusions, *coverage, final_report, &rng);
- string read_alt_name;
- bool got_read = false;
- if (gotRead!=NULL) {
- read_alt_name=gotRead->alt_name;
- got_read = true;
+ char map_code=0;
+ if (map_flags & 4) {
+ (*aligned_counter_multi)++;
}
+ if (map_flags & 16) {
+ map_code='M';
+ (*aligned_counter_xmulti)++;
+ }
+ if (map_flags & 1) (*aligned_counter)++; //acceptable mappings found
else {
- Read read;
- char map_code=0;
- if (map_flags & 4) {
- (*aligned_counter_multi)++;
- }
- if (map_flags & 16) {
- map_code='M';
- (*aligned_counter_xmulti)++;
- }
- if (map_flags & 1) (*aligned_counter)++; //acceptable mappings found
- else (*unmapped_counter)++;
- //CReadProc readProc(&um_out, unmapped_counter, aligned_counter_xmulti);
- got_read=reads_file.getRead(curr_obs_order, read, reads_format,
- false, begin_id, end_id, readProc, (map_flags & 1)==0 );
- //&um_out, map_code, unmapped_counter, aligned_counter_xmulti);
- read_alt_name=read.alt_name;
+ if (!gotRead->um_written)
+ (*unmapped_counter)++;
+ if (readProc)
+ readProc->writeUnmapped(*gotRead);
}
+
if (best_hits.hits.size() > 0) {
- if (got_read) {
update_junctions(best_hits, *final_junctions);
update_insertions_and_deletions(best_hits, *final_insertions,
- *final_deletions);
+ *final_deletions);
update_fusions(best_hits, *rt, *final_fusions, *fusions);
-
- print_sam_for_single(*rt, best_hits, fragment_type, read_alt_name,
- bam_writer, rng);
- }
- else {
- //Should never happen!
- fprintf(stderr, "Warning: getRead() failed for id# %ld.\n", curr_obs_order);
- }
+ if (!gotRead->um_written) {
+ print_sam_for_single(*rt, best_hits, fragment_type, gotRead->read.alt_name,
+ bam_writer, rng);
+ }
+ }
+ else {
+ readProc->writeUnmapped(*gotRead);
}
- //else reads_file.
}
void operator()() {
@@ -2301,7 +2323,7 @@ struct ReportWorker {
ReadTable it;
GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
- is_paired = right_map_fnames.size() > 0;
+ PE_reads = right_map_fnames.size() > 0;
ReadStream left_reads_file(left_reads_fname);
if (left_reads_file.file() == NULL)
@@ -2347,12 +2369,17 @@ struct ReportWorker {
const bool final_report = true;
// While we still have unreported hits...
- Read l_read;
- Read r_read;
+ QReadData l_read;
+ QReadData r_read;
+ int64_t* num_unmapped_unpaired = (PE_reads) ? &(alnStats->num_unmapped_unpaired) :
+ &(alnStats->num_unmapped_left);
+ int64_t* num_aligned_unpaired_xmulti= (PE_reads) ? &(alnStats->num_aligned_unpaired_xmulti) :
+ &(alnStats->num_aligned_left_xmulti);
CReadProc l_readProc(left_um_out, &(alnStats->num_unmapped_left),
- &(alnStats->num_aligned_left_xmulti));
+ &(alnStats->num_aligned_left_xmulti), num_unmapped_unpaired, num_aligned_unpaired_xmulti);
CReadProc r_readProc(right_um_out, &(alnStats->num_unmapped_right),
- &(alnStats->num_aligned_right_xmulti));
+ &(alnStats->num_aligned_right_xmulti), &(alnStats->num_unmapped_unpaired),
+ &(alnStats->num_aligned_unpaired_xmulti));
while ((curr_left_obs_order != VMAXINT32
|| curr_right_obs_order != VMAXINT32)
@@ -2366,7 +2393,7 @@ struct ReportWorker {
&& curr_left_obs_order < end_id && curr_left_obs_order != VMAXINT32) {
write_singleton_alignments(curr_left_obs_order, curr_left_hit_group,
left_reads_file, bam_writer, //*left_um_out,
- is_paired ? FRAG_LEFT : FRAG_UNPAIRED, &l_readProc);
+ PE_reads ? FRAG_LEFT : FRAG_UNPAIRED, &l_readProc);
// Get next hit group
left_hs.next_read_hits(curr_left_hit_group);
@@ -2387,7 +2414,7 @@ struct ReportWorker {
}
// Since we have both left hits and right hits for this insert,
- // Find the best pairing and print both
+ // find the best pairing and print both alignments
while (curr_left_obs_order == curr_right_obs_order
&& curr_left_obs_order < end_id && curr_left_obs_order != VMAXINT32) {
realign_reads(curr_left_hit_group, *rt, *junctions, *rev_junctions,
@@ -2403,44 +2430,54 @@ struct ReportWorker {
bool paired_alignments = curr_left_hit_group.hits.size() > 0
&& curr_right_hit_group.hits.size() > 0;
InsertAlignmentGrade grade;
- bool got_left_read = false;
- bool got_right_read = false;
- if (paired_alignments) {
- char pair_map_flags=pair_best_alignments(curr_left_hit_group, curr_right_hit_group, grade,
+ bool got_left_read = left_reads_file.getQRead(curr_left_obs_order,
+ l_read, begin_id, end_id, &l_readProc);
+ if (!got_left_read) {
+ warn_msg("Error: failed to retrieve left read for pair # %ld !\n", curr_left_obs_order);
+ }
+ bool got_right_read = right_reads_file.getQRead(curr_right_obs_order, r_read,
+ begin_id, end_id, &r_readProc);
+ if (!got_right_read) {
+ warn_msg("Error: failed to retrieve right read for pair # %ld !\n", curr_right_obs_order);
+ }
+ char pair_map_flags=0;
+ if (paired_alignments) { //there are *some* paired alignments
+ pair_map_flags=pair_best_alignments(curr_left_hit_group, curr_right_hit_group, grade,
best_hits, *gtf_junctions, *junctions, *insertions, *deletions,
*fusions, *coverage, final_report, &rng);
+ /*
if (pair_map_flags & 1) alnStats->num_aligned_left++;
else alnStats->num_unmapped_left++;
if (pair_map_flags & 2) alnStats->num_aligned_right++;
else alnStats->num_unmapped_right++;
if ((pair_map_flags & 3)==3) //at least one acceptable alignment was found for each read
alnStats->num_aligned_pairs++;
- if (pair_map_flags & 4) alnStats->num_aligned_left_multi++;
- if (pair_map_flags & 8) alnStats->num_aligned_right_multi++;
- if ((pair_map_flags & 12) == 12) alnStats->num_aligned_pairs_multi++;
char left_map_code=0;
+ */
+ /*
+ //if (pair_map_flags & 4) alnStats->num_aligned_left_multi++;
+ //if (pair_map_flags & 8) alnStats->num_aligned_right_multi++;
+ if ((pair_map_flags & 12) == 12) alnStats->num_aligned_pairs_multi++;
+
if (pair_map_flags & 16) {
- left_map_code='M';
+ //left_map_code='M';
alnStats->num_aligned_left_xmulti++;
}
char right_map_code=0;
if (pair_map_flags & 32) {
- right_map_code='M';
+ //right_map_code='M';
alnStats->num_aligned_right_xmulti++;
}
- got_left_read = left_reads_file.getRead(
- curr_left_obs_order, l_read, reads_format, false,
- begin_id, end_id, &l_readProc, (pair_map_flags & 1)==0);
- //left_um_out, left_map_code, &(alnStats->num_unmapped_left),
- //&(alnStats->num_aligned_left_xmulti));
-
- got_right_read = right_reads_file.getRead(
- curr_right_obs_order, r_read, reads_format, false,
- begin_id, end_id, &r_readProc, (pair_map_flags & 2)==0);
- //right_um_out, right_map_code, &(alnStats->num_unmapped_right),
- //&(alnStats->num_aligned_right_xmulti));
- //FIXME: what's the best way to check here if the pair alignment is discordant?
- //if (((pair_map_flags & 3)==3) && (best_hits.size() <= 0 || grade.fusion)) {
+
+ //l_read should be written as unmapped if (pair_map_flags & 1)==0
+ if (got_left_read && (pair_map_flags & 1)==0) {
+ l_readProc.writeUnmapped(l_read);
+ }
+ // r_read should be written as unmapped if (pair_map_flags & 2)==0
+ if (got_right_read && (pair_map_flags & 2)==0) {
+ r_readProc.writeUnmapped(r_read);
+ }
+ */
if (((pair_map_flags & 3)==3) && !grade.concordant()) {
alnStats->num_aligned_pairs_disc++;
}
@@ -2451,7 +2488,8 @@ struct ReportWorker {
paired_alignments = false;
}
}
- if (paired_alignments) {
+ if (paired_alignments) { //after some filtering, these paired reads still
+ // have some alignments together
HitsForRead best_left_hit_group;
best_left_hit_group.insert_id = curr_left_obs_order;
HitsForRead best_right_hit_group;
@@ -2463,16 +2501,7 @@ struct ReportWorker {
}
if (best_hits.size() > 0) {
-/*
- bool got_left_read = left_reads_file.getRead(
- best_hits[0].first.insert_id(), l_read, reads_format, false,
- begin_id, end_id, left_um_out, 0, &(alnStats->num_unmapped_left));
-
- bool got_right_read = right_reads_file.getRead(
- best_hits[0].first.insert_id(), r_read, reads_format, false,
- begin_id, end_id, right_um_out, 0, &(alnStats->num_unmapped_right));
-*/
- if (got_left_read && got_right_read) {
+ if (got_left_read && got_right_read) { //should always be true
update_junctions(best_left_hit_group, *final_junctions);
update_insertions_and_deletions(best_left_hit_group,
*final_insertions, *final_deletions);
@@ -2488,25 +2517,66 @@ struct ReportWorker {
pair_support(best_hits, *final_fusions, *fusions);
print_sam_for_pair(*rt, best_hits, grade, bam_writer,
- l_read.alt_name, r_read.alt_name, rng, begin_id, end_id);
- }
+ l_read.read.alt_name, r_read.read.alt_name, rng, begin_id, end_id);
+ l_read.um_written=true;
+ r_read.um_written=true;
+ } /*
else {
- fprintf(stderr, "Warning: couldn't get reads for pair #%ld (%d, %d)\n",
+ fprintf(stderr, "Error: couldn't get reads for pair #%ld (%d, %d)\n",
curr_left_obs_order, int(got_left_read), int(got_right_read));
- }
+ }*/
+ }
+
+ if (best_hits.size()>0) {
+ alnStats->num_aligned_left++;
+ alnStats->num_aligned_right++;
+ alnStats->num_aligned_pairs++;
+ }
+ else {
+ alnStats->num_unmapped_left++;
+ alnStats->num_unmapped_right++;
+ }
+ if (pair_map_flags & 4) alnStats->num_aligned_left_multi++;
+ if (pair_map_flags & 8) alnStats->num_aligned_right_multi++;
+ if ((pair_map_flags & 12) == 12) alnStats->num_aligned_pairs_multi++;
+
+ if (pair_map_flags & 16) {
+ //left_map_code='M';
+ alnStats->num_aligned_left_xmulti++;
+ }
+ if (pair_map_flags & 32) {
+ //right_map_code='M';
+ alnStats->num_aligned_right_xmulti++;
+ }
+
+ //l_read should be written as unmapped if (pair_map_flags & 1)==0
+ if (got_left_read && (pair_map_flags & 1)==0) {
+ l_readProc.writeUnmapped(l_read);
+ }
+ // r_read should be written as unmapped if (pair_map_flags & 2)==0
+ if (got_right_read && (pair_map_flags & 2)==0) {
+ r_readProc.writeUnmapped(r_read);
}
}
- else { //alignments not paired properly
+ else { //paired reads with alignments not paired properly
if (curr_left_hit_group.hits.size() > 0) {
write_singleton_alignments(curr_left_obs_order, curr_left_hit_group,
left_reads_file, bam_writer, //*left_um_out,
- is_paired ? FRAG_LEFT : FRAG_UNPAIRED, &l_readProc, &l_read);
+ PE_reads ? FRAG_LEFT : FRAG_UNPAIRED, &l_readProc,
+ got_left_read ? &l_read : NULL);
+ }
+ else {
+ l_readProc.writeUnmapped(l_read);
+ alnStats->num_unmapped_left++;
}
if (curr_right_hit_group.hits.size() > 0) { //only right read mapped
write_singleton_alignments(curr_right_obs_order,
curr_right_hit_group, right_reads_file, bam_writer, //*right_um_out,
- FRAG_RIGHT, &r_readProc, &r_read);
+ FRAG_RIGHT, &r_readProc, got_right_read ? &r_read : NULL);
+ } else {
+ r_readProc.writeUnmapped(r_read);
+ alnStats->num_unmapped_right++;
}
}
@@ -2522,11 +2592,11 @@ struct ReportWorker {
//print the remaining unmapped reads at the end of each reads' stream
- left_reads_file.getRead(VMAXINT32, l_read, reads_format, false, begin_id,
+ left_reads_file.getQRead(VMAXINT32, l_read, begin_id,
end_id, &l_readProc);
//left_um_out, 0, &(alnStats->num_unmapped_left), &(alnStats->num_aligned_left_xmulti));
if (right_reads_file.file())
- right_reads_file.getRead(VMAXINT32, r_read, reads_format, false, begin_id,
+ right_reads_file.getQRead(VMAXINT32, r_read, begin_id,
end_id, &r_readProc);
//right_um_out, 0, &(alnStats->num_unmapped_right), &(alnStats->num_aligned_right_xmulti));
@@ -2573,7 +2643,7 @@ struct ReportWorker {
//read alignment accounting:
SAlignStats* alnStats;
- bool is_paired;
+ bool PE_reads;
boost::mt19937 rng;
};
@@ -2945,7 +3015,7 @@ int main(int argc, char** argv)
fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
fprintf(stderr, "---------------------------------------\n");
- reads_format = FASTQ;
+ //reads_format = FASTQ;
int parse_ret = parse_options(argc, argv, print_usage);
if (parse_ret)
--
Alioth's /git/debian-med/git-commit-notice on /srv/git.debian.org/git/debian-med/tophat.git
More information about the debian-med-commit
mailing list