[med-svn] [diamond-aligner] 01/07: New upstream version 0.9.10+dfsg

Andreas Tille tille at debian.org
Mon Sep 18 13:16:24 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository diamond-aligner.

commit baec593e94cc4cfd918fa334eb9493a638abdc70
Author: Andreas Tille <tille at debian.org>
Date:   Mon Sep 18 13:30:39 2017 +0200

    New upstream version 0.9.10+dfsg
---
 CMakeLists.txt                                   |   4 +
 README.rst                                       |   2 +-
 build_simple.sh                                  |   4 +
 src/ChangeLog                                    |  14 +-
 src/align/align.cpp                              | 288 +++++------------------
 src/align/align.h                                |  64 +----
 src/align/align_queries.h                        |  67 ------
 src/align/align_target.cpp                       |   4 +
 src/align/query_mapper.cpp                       | 133 ++++++-----
 src/align/query_mapper.h                         |  16 ++
 src/basic/basic.cpp                              |   4 +-
 src/basic/config.cpp                             |  28 ++-
 src/basic/config.h                               |   6 +-
 src/basic/const.h                                |   2 +-
 src/basic/hssp.cpp                               |  30 ++-
 src/basic/match.h                                |   2 +
 src/basic/sequence.h                             |   9 +-
 src/basic/value.h                                |   2 +-
 src/data/load_seqs.h                             |  12 +-
 src/data/reference.cpp                           |  22 ++
 src/data/reference.h                             |   9 +-
 src/data/taxonomy.cpp                            |  81 ++++++-
 src/data/taxonomy.h                              |  15 ++
 src/dp/dp.h                                      |   7 +-
 src/dp/swipe.cpp                                 |   2 +-
 src/extra/match_file.h                           |   1 +
 src/output/blast_pairwise_format.cpp             |   5 +-
 src/output/blast_tab_format.cpp                  |   8 +-
 src/output/join_blocks.cpp                       |  51 +++-
 src/output/output.h                              |  39 ++-
 src/output/output_format.cpp                     |  25 +-
 src/output/output_format.h                       |  69 +++++-
 src/output/output_sink.cpp                       |  86 +++++++
 src/output/sam_format.cpp                        |   2 +-
 src/{basic/const.h => output/target_culling.cpp} |  30 +--
 src/output/target_culling.h                      |  73 ++++++
 src/output/taxon_format.cpp                      |  51 ++++
 src/output/{view.h => view.cpp}                  |  67 +++---
 src/run/double_indexed.cpp                       |  10 +-
 src/run/main.cpp                                 |   7 +-
 src/run/tools.cpp                                |   6 +-
 src/util/log_stream.h                            |   5 +
 src/{basic/const.h => util/queue.h}              |  59 +++--
 43 files changed, 830 insertions(+), 591 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05d6c12..35ee8bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,6 +85,10 @@ add_executable(diamond src/run/main.cpp
   src/data/seed_set.cpp
   src/util/binary_file.cpp
   src/util/simd.cpp
+  src/output/taxon_format.cpp
+  src/output/view.cpp
+  src/output/output_sink.cpp
+  src/output/target_culling.cpp
 )
 
 target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/README.rst b/README.rst
index 7d043f3..40c77f1 100644
--- a/README.rst
+++ b/README.rst
@@ -8,7 +8,7 @@ Please read the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond
 
 Installing the software on your system may be done by downloading it in binary format for immediate use::
 
-    wget http://github.com/bbuchfink/diamond/releases/download/v0.9.8/diamond-linux64.tar.gz
+    wget http://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz
     tar xzf diamond-linux64.tar.gz
 
 The extracted ``diamond`` binary file should be moved to a directory contained in your executable search path (PATH environment variable).
diff --git a/build_simple.sh b/build_simple.sh
index 17fb696..6a8a4dc 100755
--- a/build_simple.sh
+++ b/build_simple.sh
@@ -53,4 +53,8 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
   src/data/seed_set.cpp \
   src/util/binary_file.cpp \
   src/util/simd.cpp \
+  src/output/taxon_format.cpp \
+  src/output/view.cpp \
+  src/output/output_sink.cpp \
+  src/output/target_culling.cpp \
 -lz -lpthread -o diamond
diff --git a/src/ChangeLog b/src/ChangeLog
index b1f7103..feaca8c 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,13 @@
+[0.9.10]
+- added --strand option to choose query strand
+- added dbinfo command
+
+[0.9.9]
+- Added taxonomic classification format.
+- Fixed a bug in getseq printing masked residues.
+- Fixed parsing of UniRef100_ sequence id prefixes.
+- Added support for using the staxids output field in diamond view.
+
 [0.9.8]
 - Fixed a compiler errror.
 
@@ -205,7 +215,7 @@
 
 [0.7.11]
 - added --version switch
-- added static build optiom for CMake build
+- added static build option for CMake build
 - fixed a bug that would cause a return code of 1 without an error
 
 [0.7.10]
@@ -270,4 +280,4 @@
 - reduced usage of temporary disk space and memory
 - fixed a compiler error for GCC 4.1.2
 - fixed a bug that could cause the program to hang when running out of temporary space or memory
-- added --forwardonly option to view command
+- added --forwardonly option to view command
\ No newline at end of file
diff --git a/src/align/align.cpp b/src/align/align.cpp
index 8b6795c..2e60268 100644
--- a/src/align/align.cpp
+++ b/src/align/align.cpp
@@ -18,93 +18,65 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 #include "../basic/value.h"
 #include "align.h"
-#include "align_queries.h"
 #include "../data/reference.h"
 #include "../output/output_format.h"
+#include "../util/queue.h"
+#include "../output/output.h"
+#include "query_mapper.h"
+#include "../util/merge_sort.h"
 
 using std::map;
 
-auto_ptr<Simple_query_queue> Simple_query_queue::instance;
-auto_ptr<Output_sink> Output_sink::instance;
-
-size_t Simple_query_queue::get(vector<hit>::iterator &begin, vector<hit>::iterator &end)
+struct Align_fetcher
 {
-	mtx_.lock();
-	const unsigned query = (unsigned)(next_++),
-		c = align_mode.query_contexts;
-	if (query >= qend_) {
-		mtx_.unlock();
-		return Simple_query_queue::end;
+	static void init(size_t qbegin, size_t qend, vector<hit>::iterator begin, vector<hit>::iterator end)
+	{
+		it_ = begin;
+		end_ = end;
+		queue_ = auto_ptr<Queue>(new Queue(qbegin, qend));
 	}
-	begin = it_;
-	while (it_ < end_ && it_->query_ / c == query)
-		++it_;
-	end = it_;
-	mtx_.unlock();	
-	return query;
-}
-
-void Output_sink::push(size_t n, Text_buffer *buf)
-{
-	mtx_.lock();
-	//cout << "n=" << n << " next=" << next_ << endl;
-	if (n != next_) {
-		backlog_[n] = buf;
-		size_ += buf ? buf->alloc_size() : 0;
-		max_size_ = std::max(max_size_, size_);
-		mtx_.unlock();
+	void operator()(size_t query)
+	{
+		const unsigned q = (unsigned)query,
+			c = align_mode.query_contexts;
+		begin = it_;
+		while (it_ < end_ && it_->query_ / c == q)
+			++it_;
+		end = it_;
+		this->query = query;
 	}
-	else
-		flush(buf);
-}
+	bool get()
+	{
+		return queue_->get(*this) != Queue::end;
+	}
+	size_t query;
+	vector<hit>::iterator begin, end;
+private:	
+	static vector<hit>::iterator it_, end_;
+	static auto_ptr<Queue> queue_;
+};
 
-void Output_sink::flush(Text_buffer *buf)
-{
-	size_t n = next_ + 1;
-	vector<Text_buffer*> out;
-	out.push_back(buf);
-	map<size_t, Text_buffer*>::iterator i;
-	do {
-		while ((i = backlog_.begin()) != backlog_.end() && i->first == n) {
-			out.push_back(i->second);
-			backlog_.erase(i);
-			++n;
-		}
-		mtx_.unlock();
-		size_t size = 0;
-		for (vector<Text_buffer*>::iterator j = out.begin(); j < out.end(); ++j) {
-			if (*j) {
-				f_->write((*j)->get_begin(), (*j)->size());
-				if(*j != buf)
-					size += (*j)->alloc_size();
-				delete *j;
-			}
-		}
-		out.clear();
-		mtx_.lock();
-		size_ -= size;
-	} while ((i = backlog_.begin()) != backlog_.end() && i->first == n);
-	next_ = n;
-	mtx_.unlock();
-}
+auto_ptr<Queue> Align_fetcher::queue_;
+vector<hit>::iterator Align_fetcher::it_;
+vector<hit>::iterator Align_fetcher::end_;
 
 void align_worker(size_t thread_id)
 {
-	vector<hit>::iterator begin, end;
-	size_t query;
+	Align_fetcher hits;
 	Statistics stat;
-	while ((query = Simple_query_queue::get().get(begin, end)) != Simple_query_queue::end) {
-		if (end == begin) {
+	while (hits.get()) {
+		if (hits.end == hits.begin) {
 			Text_buffer *buf = 0;
 			if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0) {
 				buf = new Text_buffer;
-				output_format->print_query_intro(query, query_ids::get()[query].c_str(), get_source_query_len((unsigned)query), *buf, true);
-				output_format->print_query_epilog(*buf, true);
+				const char *query_title = query_ids::get()[hits.query].c_str();
+				output_format->print_query_intro(hits.query, query_title, get_source_query_len((unsigned)hits.query), *buf, true);
+				output_format->print_query_epilog(*buf, query_title, true);
 			}
-			Output_sink::get().push(query, buf);
+			Output_sink::get().push(hits.query, buf);
 			continue;
 		}
-		Query_mapper mapper(query, begin, end);
+		Query_mapper mapper(hits.query, hits.begin, hits.end);
 		mapper.init();
 		if (config.ext == Config::swipe)
 			mapper.align_targets(stat);
@@ -113,11 +85,13 @@ void align_worker(size_t thread_id)
 			for (size_t i = 0; i < mapper.n_targets(); ++i)
 				mapper.ungapped_stage(i);
 			if (config.ext != Config::most_greedy) {
+				mapper.fill_source_ranges();
 				mapper.rank_targets(config.rank_ratio == -1 ? (mapper.query_seq(0).length() > 50 ? 0.6 : 0.9) : config.rank_ratio);
 				stat.inc(Statistics::TARGET_HITS1, mapper.n_targets());
 				const int cutoff = int(mapper.raw_score_cutoff() * config.score_ratio);
 				for (size_t i = 0; i < mapper.n_targets(); ++i)
 					mapper.greedy_stage(i, stat, cutoff);
+				mapper.fill_source_ranges();
 				mapper.rank_targets(config.rank_ratio2 == -1 ? (mapper.query_seq(0).length() > 50 ? 0.95 : 1.0) : config.rank_ratio2);
 				stat.inc(Statistics::TARGET_HITS2, mapper.n_targets());
 				for (size_t i = 0; i < mapper.n_targets(); ++i)
@@ -129,157 +103,18 @@ void align_worker(size_t thread_id)
 			buf = new Text_buffer;
 			const bool aligned = mapper.generate_output(*buf, stat);
 			if (aligned && !config.unaligned.empty())
-				query_aligned[query] = true;
-		}
-		Output_sink::get().push(query, buf);
-	}
-	statistics += stat;
-}
-
-void heartbeat_worker()
-{
-	static const int interval = 100;
-	int n = 0;
-	while (Output_sink::get().next() < Simple_query_queue::get().qend()) {
-		if (n == interval) {
-			const string title(query_ids::get()[Output_sink::get().next()].c_str());
-			verbose_stream << "Queries=" << Simple_query_queue::get().next() << " size=" << megabytes(Output_sink::get().size()) << " max_size=" << megabytes(Output_sink::get().max_size())
-				<< " next=" << title.substr(0, title.find(' ')) << endl;
-			n = 0;
-		}
-		else
-			++n;
-		tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10));
-	}
-}
-
-Query_queue query_queue;
-
-void Query_queue::init(Trace_pt_list::iterator begin, Trace_pt_list::iterator end)
-{
-	trace_pt_pos = begin;
-	trace_pt_end = end;
-	assert(queue.empty());
-	assert(out_queue.empty());
-	n = 0;
-}
-	
-void Query_queue::flush(Output_stream *out, Statistics &stat)
-{
-	writing = true;
-	std::queue<Query_data*> q;
-	while (true) {
-		while (!out_queue.empty() && out_queue.front()->state == Query_data::finished) {
-			q.push(out_queue.front());
-			out_queue.pop();
-		}
-		if (q.empty()) {
-			writing = false;
-			lock.unlock();
-			return;
-		}
-		lock.unlock();
-
-		unsigned k = 0;
-		while (!q.empty()) {
-			out->write(q.front()->buf.get_begin(), q.front()->buf.size());
-			delete q.front();
-			q.pop();
-			++k;
+				query_aligned[hits.query] = true;
 		}
-
-		lock.lock();
-		n -= k;
-		/*if (n > 100)
-		cout << "qlen=" << out_queue.size() << " finished=" << n << endl;*/
+		Output_sink::get().push(hits.query, buf);
 	}
-}
-
-Query_data* Query_queue::get()
-{
-	for (std::deque<Query_data*>::iterator i = queue.begin(); i != queue.end(); ++i)
-		if ((*i)->state == Query_data::free)
-			return *i;
-	return 0;
-}
-
-void Query_queue::pop_busy()
-{
-	while (!queue.empty() && (queue.front()->state == Query_data::closing || queue.front()->state == Query_data::finished)) {
-		out_queue.push(queue.front());
-		queue.pop_front();
-	}
-}
-
-void align_worker(Output_stream *out)
-{
-	Statistics stat;
-	Query_data *data = 0;
-	unsigned n_targets = 0;
-
-	while (true) {
-
-		query_queue.lock.lock();
-
-		if (data) {
-			data->mapper->targets_finished += n_targets;
-			if (data->mapper->finished()) {
-				query_queue.lock.unlock();
-				const bool aligned = data->mapper->generate_output(data->buf, stat);
-				const unsigned query_id = data->mapper->query_id;
-				delete data->mapper;
-				query_queue.lock.lock();
-				data->state = Query_data::finished;
-				if (aligned && !config.unaligned.empty())
-					query_aligned[query_id] = true;
-				++query_queue.n;
-				if (!query_queue.writing && !query_queue.out_queue.empty() && data == query_queue.out_queue.front()) {
-					query_queue.flush(out, stat);
-					data = 0;
-					continue;
-				}
-			}
-		}
-
-		if (!(data = query_queue.get())) {
-			if (query_queue.trace_pt_pos >= query_queue.trace_pt_end) {
-				query_queue.lock.unlock();
-				break;
-			}
-			else {
-				query_queue.queue.push_back(new Query_data(new Query_mapper()));
-				data = query_queue.queue.back();
-				query_queue.lock.unlock();
-				data->mapper->init();
-				data->state = Query_data::free;
-				data = 0;
-			}
-		}
-		else {
-			size_t target = data->mapper->next_target;
-			n_targets = std::min(config.target_fetch_size, (unsigned)data->mapper->n_targets() - data->mapper->next_target);
-			data->mapper->next_target += n_targets;
-			if (target + n_targets == data->mapper->n_targets()) {
-				data->state = Query_data::closing;
-				query_queue.pop_busy();
-			}
-			query_queue.lock.unlock();
-
-			for (unsigned i = 0; i < n_targets; ++i)
-				data->mapper->align_target(target + i, stat);
-		}
-
-	}
-
 	statistics += stat;
 }
 
 void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file)
 {
-	query_queue.last_query = (unsigned)-1;
 	const size_t max_size = (size_t)std::min(config.chunk_size*1e9 * 9 * 2 / config.lowmem, 2e9);
 	pair<size_t, size_t> query_range;
-	while(true) {
+	while (true) {
 		task_timer timer("Loading trace points", 3);
 		Trace_pt_list *v = new Trace_pt_list;
 		statistics.max(Statistics::TEMP_SPACE, trace_pts.load(*v, max_size, query_range));
@@ -291,32 +126,15 @@ void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file)
 		merge_sort(v->begin(), v->end(), config.threads_);
 		v->init();
 		timer.go("Computing alignments");
-		if (config.load_balancing == Config::target_parallel) {
-			query_queue.init(v->begin(), v->end());
-			Thread_pool threads;
-			for (unsigned i = 0; i < config.threads_; ++i)
-				threads.push_back(launch_thread(static_cast<void (*)(Output_stream*)>(&align_worker), output_file));
-			threads.join_all();
-		}
-		else {
-			Simple_query_queue::instance = auto_ptr<Simple_query_queue>(new Simple_query_queue(query_range.first, query_range.second, v->begin(), v->end()));
-			Output_sink::instance = auto_ptr<Output_sink>(new Output_sink(query_range.first, output_file));
-			Thread_pool threads;
-			if (config.verbosity >= 3)
-				threads.push_back(launch_thread(heartbeat_worker));
-			for (size_t i = 0; i < (config.threads_align == 0 ? config.threads_ : config.threads_align); ++i)
-				threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), i));
-			threads.join_all();
-		}
+		Align_fetcher::init(query_range.first, query_range.second, v->begin(), v->end());
+		Output_sink::instance = auto_ptr<Output_sink>(new Output_sink(query_range.first, output_file));
+		Thread_pool threads;
+		if (config.verbosity >= 3)
+			threads.push_back(launch_thread(heartbeat_worker, query_range.second));
+		for (size_t i = 0; i < (config.threads_align == 0 ? config.threads_ : config.threads_align); ++i)
+			threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), i));
+		threads.join_all();
 		timer.go("Deallocating buffers");
 		delete v;
 	}
-	if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0 && config.load_balancing == Config::target_parallel) {
-		Text_buffer buf;
-		for (unsigned i = query_queue.last_query + 1; i < query_ids::get().get_length(); ++i) {
-			output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), buf, true);
-			output_format->print_query_epilog(buf, true);
-		}
-		output_file->write(buf.get_begin(), buf.size());
-	}
 }
\ No newline at end of file
diff --git a/src/align/align.h b/src/align/align.h
index 33588b7..07d4b27 100644
--- a/src/align/align.h
+++ b/src/align/align.h
@@ -64,68 +64,6 @@ private:
 	Task_queue<_buffer, Output_writer> queue;
 };
 
-struct Simple_query_queue
-{
-	enum { end = 0xffffffffffffffffllu };
-	Simple_query_queue(size_t qbegin, size_t qend, vector<hit>::iterator begin, vector<hit>::iterator end):
-		next_(qbegin),
-		qend_(qend),
-		it_(begin),
-		end_(end)
-	{}
-	size_t get(vector<hit>::iterator &begin, vector<hit>::iterator &end);
-	size_t next() const
-	{
-		return next_;
-	}
-	size_t qend() const
-	{
-		return qend_;
-	}
-	static Simple_query_queue& get()
-	{
-		return *instance;
-	}
-	static auto_ptr<Simple_query_queue> instance;
-private:
-	tthread::mutex mtx_;
-	size_t next_;
-	const size_t qend_;
-	vector<hit>::iterator it_, end_;
-};
-
-struct Output_sink
-{
-	Output_sink(size_t begin, Output_stream *f):
-		f_(f),
-		next_(begin),
-		size_(0),
-		max_size_(0)
-	{}
-	void push(size_t n, Text_buffer *buf);
-	size_t size() const
-	{
-		return size_;
-	}
-	size_t max_size() const
-	{
-		return max_size_;
-	}
-	static Output_sink& get()
-	{
-		return *instance;
-	}
-	size_t next() const
-	{
-		return next_;
-	}
-	static auto_ptr<Output_sink> instance;
-private:
-	void flush(Text_buffer *buf);
-	tthread::mutex mtx_;
-	Output_stream* const f_;
-	std::map<size_t, Text_buffer*> backlog_;
-	size_t next_, size_, max_size_;
-};
+void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file);
 
 #endif
\ No newline at end of file
diff --git a/src/align/align_queries.h b/src/align/align_queries.h
deleted file mode 100644
index 8409f46..0000000
--- a/src/align/align_queries.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/****
-DIAMOND protein aligner
-Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as
-published by the Free Software Foundation, either version 3 of the
-License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-****/
-
-#ifndef ALIGN_QUERIES_H_
-#define ALIGN_QUERIES_H_
-
-#include <deque>
-#include "../util/merge_sort.h"
-#include "../search/trace_pt_buffer.h"
-#include "../util/map.h"
-#include "../util/task_queue.h"
-#include "align.h"
-#include "query_mapper.h"
-
-using std::vector;
-
-struct Query_data
-{
-	Query_data():
-		mapper(0)
-	{}
-	Query_data(Query_mapper *mapper):
-		mapper(mapper),
-		state(init)
-	{}
-	Query_mapper *mapper;
-	Text_buffer buf;
-	enum { init, free, closing, finished };
-	unsigned state;
-};
-
-struct Query_queue
-{
-	void init(Trace_pt_list::iterator begin, Trace_pt_list::iterator end);
-	void flush(Output_stream *out, Statistics &stat);
-	Query_data* get();
-	void pop_busy();
-
-	tthread::mutex lock;
-	std::deque<Query_data*> queue;
-	std::queue<Query_data*> out_queue;
-	Trace_pt_list::iterator trace_pt_pos, trace_pt_end;
-	bool writing;
-	unsigned n, last_query;
-};
-
-extern Query_queue query_queue;
-
-void align_worker(Output_stream *out);
-void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file);
-
-#endif /* ALIGN_QUERIES_H_ */
diff --git a/src/align/align_target.cpp b/src/align/align_target.cpp
index 43af301..e027ac6 100644
--- a/src/align/align_target.cpp
+++ b/src/align/align_target.cpp
@@ -272,6 +272,10 @@ void Query_mapper::align_target(size_t idx, Statistics &stat)
 	target.hsps.sort();
 	if(target.hsps.size() > 0)
 		target.filter_score = target.hsps.front().score;
+
+	target.ts.clear();
+	for (list<Hsp_data>::iterator i = target.hsps.begin(); i != target.hsps.end(); ++i)
+		target.ts.push_back(Hsp_traits(i->query_source_range));
 	
 	if (config.use_smith_waterman && !target.hsps.empty()) {
 		int score;
diff --git a/src/align/query_mapper.cpp b/src/align/query_mapper.cpp
index 401a52e..12cb96e 100644
--- a/src/align/query_mapper.cpp
+++ b/src/align/query_mapper.cpp
@@ -16,24 +16,36 @@ You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#include "align_queries.h"
 #include "query_mapper.h"
 #include "../data/reference.h"
 #include "extend_ungapped.h"
 #include "../output/output.h"
 #include "../output/output_format.h"
 #include "../output/daa_write.h"
+#include "../output/target_culling.h"
 
-Query_mapper::Query_mapper() :
-	source_hits(get_query_data()),
-	query_id(source_hits.first->query_ / align_mode.query_contexts),
-	targets_finished(0),
-	next_target(0),
-	source_query_len(get_source_query_len(query_id)),
-	unaligned_from(query_queue.last_query+1)
-{	
-	query_queue.last_query = query_id;
-	seed_hits.reserve(source_hits.second - source_hits.first);
+bool Target::envelopes(const Hsp_traits &t, double p) const
+{
+	for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); ++i)
+		if (t.query_source_range.overlap_factor(i->query_source_range) >= p)
+			return true;
+	return false;
+}
+
+bool Target::is_enveloped(const Target &t, double p) const
+{
+	for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); ++i)
+		if (!t.envelopes(*i, p))
+			return false;
+	return true;
+}
+
+bool Target::is_enveloped(Ptr_vector<Target>::const_iterator begin, Ptr_vector<Target>::const_iterator end, double p, int min_score) const
+{
+	for (; begin != end; ++begin)
+		if (is_enveloped(**begin, p) && (*begin)->filter_score >= min_score)
+			return true;
+	return false;
 }
 
 int Query_mapper::raw_score_cutoff() const
@@ -70,18 +82,6 @@ void Query_mapper::init()
 		rank_targets(config.rank_ratio == -1 ? 0.6 : config.rank_ratio);
 }
 
-pair<Trace_pt_list::iterator, Trace_pt_list::iterator> Query_mapper::get_query_data()
-{
-	const Trace_pt_list::iterator begin = query_queue.trace_pt_pos;
-	if (begin == query_queue.trace_pt_end)
-		return pair<Trace_pt_list::iterator, Trace_pt_list::iterator>(begin, begin);
-	const unsigned c = align_mode.query_contexts, query = begin->query_ / c;
-	Trace_pt_list::iterator end = begin;
-	for (; end < query_queue.trace_pt_end && end->query_ / c == query; ++end);
-	query_queue.trace_pt_pos = end;
-	return pair<Trace_pt_list::iterator, Trace_pt_list::iterator>(begin, end);
-}
-
 unsigned Query_mapper::count_targets()
 {
 	std::sort(source_hits.first, source_hits.second, hit::cmp_subject);
@@ -132,49 +132,64 @@ void Query_mapper::rank_targets(double ratio)
 {
 	std::sort(targets.begin(), targets.end(), Target::compare);
 
-	int score = 0;
-	if (config.toppercent < 100) {
-		score = int((double)targets[0].filter_score * (1.0 - config.toppercent / 100.0) * ratio);
+	if (config.query_overlap_culling == 0.0) {
+
+		int score = 0;
+		if (config.toppercent < 100) {
+			score = int((double)targets[0].filter_score * (1.0 - config.toppercent / 100.0) * ratio);
+		}
+		else {
+			size_t min_idx = std::min(targets.size(), (size_t)config.max_alignments);
+			score = int((double)targets[min_idx - 1].filter_score * ratio);
+		}
+
+		unsigned i = 0;
+		for (; i < targets.size(); ++i)
+			if (targets[i].filter_score < score)
+				break;
+
+		if (config.benchmark_ranking)
+			for (unsigned j = i; j < targets.size(); ++j)
+				targets[j].outranked = true;
+		else
+			targets.erase(targets.begin() + i, targets.end());
+
 	}
 	else {
-		size_t min_idx = std::min(targets.size(), (size_t)config.max_alignments);
-		score = int((double)targets[min_idx - 1].filter_score * ratio);
-	}
 
-	unsigned i = 0;
-	for (; i < targets.size(); ++i)
-		if (targets[i].filter_score < score)
-			break;
-	
-	if (config.benchmark_ranking)
-		for (unsigned j = i; j < targets.size(); ++j)
-			targets[j].outranked = true;
-	else
-		targets.erase(targets.begin() + i, targets.end());
+		int i = 0;
+		const double p = config.query_overlap_culling / 100.0;
+		while (i < (int)targets.size()) {
+			int min_score = int((double)targets[i].filter_score / (1.0 - config.toppercent / 100.0) / ratio);
+			if (targets[i].is_enveloped(targets.begin(), targets.begin() + i, p, min_score))
+				targets.erase(targets.begin() + i, targets.begin() + i + 1);
+			else
+				++i;
+		}
+
+	}
 }
 
 bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
 {
-	if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0 && config.load_balancing == Config::target_parallel) {
-		for (unsigned i = unaligned_from; i < query_id; ++i) {
-			output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), buffer, true);
-			output_format->print_query_epilog(buffer, true);
-		}
-	}
-
 	std::sort(targets.begin(), targets.end(), Target::compare);
 
 	unsigned n_hsp = 0, n_target_seq = 0, hit_hsps = 0;
-	const unsigned top_score = targets.empty() ? 0 : targets[0].filter_score, query_len = (unsigned)query_seq(0).length();
+	auto_ptr<Target_culling> target_culling(Target_culling::get(targets.empty() ? 0 : &targets[0]));
+	const unsigned query_len = (unsigned)query_seq(0).length();
 	size_t seek_pos = 0;
 	const char *query_title = query_ids::get()[query_id].c_str();
+	auto_ptr<Output_format> f(output_format->clone());
 
 	for (size_t i = 0; i < targets.size(); ++i) {
 		if ((config.min_bit_score == 0 && score_matrix.evalue(targets[i].filter_score, config.db_size, query_len) > config.max_evalue)
 			|| score_matrix.bitscore(targets[i].filter_score) < config.min_bit_score)
 			break;
 
-		if (!config.output_range(n_target_seq, targets[i].filter_score, top_score))
+		const int c = target_culling->cull(targets[i]);
+		if (c == Target_culling::next)
+			continue;
+		else if (c == Target_culling::finished)
 			break;
 
 		if (targets[i].outranked)
@@ -202,15 +217,15 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
 			}
 			else {
 				if (n_hsp == 0) {
-					if (*output_format == Output_format::daa)
+					if (*f == Output_format::daa)
 						seek_pos = write_daa_query_record(buffer, query_title, align_mode.query_translated ? query_source_seqs::get()[query_id] : query_seqs::get()[query_id]);
 					else
-						output_format->print_query_intro(query_id, query_title, source_query_len, buffer, false);
+						f->print_query_intro(query_id, query_title, source_query_len, buffer, false);
 				}
-				if (*output_format == Output_format::daa)
+				if (*f == Output_format::daa)
 					write_daa_record(buffer, *j, query_id, targets[i].subject_id);
 				else
-					output_format->print_match(Hsp_context(*j,
+					f->print_match(Hsp_context(*j,
 						query_id,
 						query_seq(j->frame),
 						query_source_seq(),
@@ -223,8 +238,10 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
 						hit_hsps), buffer);
 			}
 
-			if(hit_hsps == 0)
+			if (hit_hsps == 0) {
 				++n_target_seq;
+				target_culling->add(&targets[i]);
+			}
 			++n_hsp;
 			++hit_hsps;
 			if (config.alignment_traceback && j->gap_openings > 0)
@@ -235,17 +252,17 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
 
 	if (n_hsp > 0) {
 		if (!blocked_processing) {
-			if (*output_format == Output_format::daa)
+			if (*f == Output_format::daa)
 				finish_daa_query_record(buffer, seek_pos);
 			else
-				output_format->print_query_epilog(buffer, false);
+				f->print_query_epilog(buffer, query_title, false);
 		}
 		else
 			Intermediate_record::finish_query(buffer, seek_pos);
 	}
-	else if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0) {
-		output_format->print_query_intro(query_id, query_title, source_query_len, buffer, true);
-		output_format->print_query_epilog(buffer, true);
+	else if (!blocked_processing && *f != Output_format::daa && config.report_unaligned != 0) {
+		f->print_query_intro(query_id, query_title, source_query_len, buffer, true);
+		f->print_query_epilog(buffer, query_title, true);
 	}
 
 	if (!blocked_processing) {
diff --git a/src/align/query_mapper.h b/src/align/query_mapper.h
index 5da19c3..bbcd8b8 100644
--- a/src/align/query_mapper.h
+++ b/src/align/query_mapper.h
@@ -34,6 +34,9 @@ using std::list;
 
 struct Target
 {
+	Target(int score):
+		filter_score(score)
+	{}
 	Target(size_t begin, unsigned subject_id) :
 		subject_id(subject_id),
 		filter_score(0),
@@ -44,6 +47,14 @@ struct Target
 	{
 		return lhs->filter_score > rhs->filter_score;
 	}
+	void fill_source_ranges(size_t query_len)
+	{
+		for (list<Hsp_traits>::iterator i = ts.begin(); i != ts.end(); ++i)
+			i->query_source_range = align_mode.query_translated ? untranslate_range(i->query_range, i->frame, query_len) : i->query_range;
+	}
+	bool envelopes(const Hsp_traits &t, double p) const;
+	bool is_enveloped(const Target &t, double p) const;
+	bool is_enveloped(Ptr_vector<Target>::const_iterator begin, Ptr_vector<Target>::const_iterator end, double p, int min_score) const;
 	unsigned subject_id;
 	int filter_score;
 	float filter_time;
@@ -78,6 +89,11 @@ struct Query_mapper
 	{
 		return query_seqs::get()[query_id*align_mode.query_contexts + frame];
 	}
+	void fill_source_ranges()
+	{
+		for (size_t i = 0; i < targets.size(); ++i)
+			targets[i].fill_source_ranges(source_query_len);
+	}
 	pair<Trace_pt_list::iterator, Trace_pt_list::iterator> source_hits;
 	unsigned query_id, targets_finished, next_target;
 private:
diff --git a/src/basic/basic.cpp b/src/basic/basic.cpp
index aa6fab3..5455ea3 100644
--- a/src/basic/basic.cpp
+++ b/src/basic/basic.cpp
@@ -24,9 +24,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "sequence.h"
 #include "masking.h"
 
-const char* Const::version_string = "0.9.8";
+const char* Const::version_string = "0.9.10";
 const char* Const::program_name = "diamond";
-const char* Const::id_delimiters = " \a\b\f\n\r\t\v";
+const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
 
 Value_traits::Value_traits(const char *alphabet, Letter mask_char, const char *ignore) :
 	alphabet(alphabet),
diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index bb3fe7a..edbace2 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -56,7 +56,8 @@ Config::Config(int argc, const char **argv)
 		.add_command("model-seqs", "")
 		.add_command("opt", "")
 		.add_command("mask", "")
-		.add_command("fastq2fasta", "");
+		.add_command("fastq2fasta", "")
+		.add_command("dbinfo", "");
 
 	Options_group general("General options");
 	general.add()
@@ -111,10 +112,12 @@ Config::Config(int argc, const char **argv)
 	Options_group aligner("Aligner options");
 	aligner.add()
 		("query", 'q', "input query file", query_file)
+		("strand", 0, "query strands to search (both/minus/plus)", query_strands, string("both"))
 		("un", 0, "file for unaligned queries", unaligned)
 		("unal", 0, "report unaligned queries (0=no, 1=yes)", report_unaligned, -1)
 		("max-target-seqs", 'k', "maximum number of target sequences to report alignments for", max_alignments, uint64_t(25))
 		("top", 0, "report alignments within this percentage range of top alignment score (overrides --max-target-seqs)", toppercent, 100.0)
+		("overlap-culling", 0, "delete hits only if a higher scoring hit envelops the given percentage of the query range", query_overlap_culling)
 		("compress", 0, "compression for output files (0=none, 1=gzip)", compression)
 		("evalue", 'e', "maximum e-value to report alignments (default=0.001)", max_evalue, 0.001)
 		("min-score", 0, "minimum bit score to report alignments (overrides e-value setting)", min_bit_score)
@@ -137,8 +140,10 @@ Config::Config(int argc, const char **argv)
 		//("seg", 0, "enable SEG masking of queries (yes/no)", seg)
 		("query-gencode", 0, "genetic code to use to translate query (see user manual)", query_gencode, 1u)
 		("salltitles", 0, "include full subject titles in DAA file", salltitles)
+		("sallseqid", 0, "include all subject ids in DAA file", sallseqid)
 		("no-self-hits", 0, "suppress reporting of identical self hits", no_self_hits)
-		("taxonmap", 0, "protein accession to taxid mapping file", prot_accession2taxid);
+		("taxonmap", 0, "protein accession to taxid mapping file", prot_accession2taxid)
+		("taxonnodes", 0, "taxonomy nodes.dmp from NCBI", nodesdmp);
 
 	Options_group advanced("Advanced options");
 	advanced.add()
@@ -230,7 +235,7 @@ Config::Config(int argc, const char **argv)
 			throw std::runtime_error("Invalid option: --block-size/-b. Block size is set for the alignment commands.");
 		break;
 	case Config::blastp:
-	case Config::blastx:	
+	case Config::blastx:
 		if (database == "")
 			throw std::runtime_error("Missing parameter: database file (--db/-d)");
 		if (daa_file.length() > 0) {
@@ -254,6 +259,12 @@ Config::Config(int argc, const char **argv)
 		;
 	}
 
+	switch (command) {
+	case Config::dbinfo:
+		if (database == "")
+			throw std::runtime_error("Missing parameter: database file (--db/-d)");
+	}
+
 	if (hit_cap != 0)
 		throw std::runtime_error("Deprecated parameter: --max-hits/-C.");
 
@@ -353,14 +364,6 @@ Config::Config(int argc, const char **argv)
 #ifdef __POPCNT__
 		verbose_stream << "POPCNT enabled." << endl;
 #endif
-
-		message_stream << "#Target sequences to report alignments for: ";
-		if (max_alignments == 0) {
-			max_alignments = std::numeric_limits<uint64_t>::max();
-			message_stream << "unlimited" << endl;
-		}
-		else
-			message_stream << max_alignments << endl;
 	}
 
 	Translator::init(query_gencode);
@@ -371,6 +374,9 @@ Config::Config(int argc, const char **argv)
 	if (command == help)
 		parser.print_help();
 
+	if (query_strands != "both" && query_strands != "minus" && query_strands != "plus")
+		throw std::runtime_error("Invalid value for parameter --strand");
+
 	/*log_stream << "sizeof(hit)=" << sizeof(hit) << " sizeof(packed_uint40_t)=" << sizeof(packed_uint40_t)
 		<< " sizeof(sorted_list::entry)=" << sizeof(sorted_list::entry) << endl;*/
 }
diff --git a/src/basic/config.h b/src/basic/config.h
index 0b1035b..a348d46 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -139,10 +139,14 @@ struct Config
 	double score_ratio;
 	bool small_query;
 	bool hashed_seeds;
+	string nodesdmp;
+	bool sallseqid;
+	double query_overlap_culling;
+	string query_strands;
 
 	enum {
 		makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
-		match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18
+		match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18, dbinfo=19
 	};
 	unsigned	command;
 
diff --git a/src/basic/const.h b/src/basic/const.h
index 0dd7c7a..a2107ba 100644
--- a/src/basic/const.h
+++ b/src/basic/const.h
@@ -23,7 +23,7 @@ struct Const
 {
 
 	enum {
-		build_version = 109,
+		build_version = 111,
 		daa_version = 0,
 		seedp_bits = 10,
 		seedp = 1<<seedp_bits,
diff --git a/src/basic/hssp.cpp b/src/basic/hssp.cpp
index d0134d9..579523e 100644
--- a/src/basic/hssp.cpp
+++ b/src/basic/hssp.cpp
@@ -18,6 +18,21 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 #include "../align/align.h"
 
+interval untranslate_range(const interval &r, unsigned frame, size_t len)
+{
+	signed f = frame <= 2 ? frame + 1 : 2 - frame;
+	interval s;
+	if (f > 0) {
+		s.begin_ = (f - 1) + 3 * r.begin_;
+		s.end_ = s.begin_ + 3 * r.length();
+	}
+	else {
+		s.end_ = (int)len + f - 3 * r.begin_ + 1;
+		s.begin_ = s.end_ - 3 * r.length();
+	}
+	return s;
+}
+
 bool Hsp_data::pass_through(const Diagonal_segment &d) const
 {
 	if (intersect(d.query_range(), query_range).length() != d.len
@@ -80,19 +95,8 @@ void Hsp_data::set_source_range(unsigned frame, unsigned dna_len)
 	this->frame = frame;
 	if (!align_mode.query_translated)
 		query_source_range = query_range;
-	else {
-		signed f = frame <= 2 ? frame + 1 : 2 - frame;
-		if (f > 0) {
-			query_source_range.begin_ = (f - 1) + 3 * query_range.begin_;
-			query_source_range.end_ = query_source_range.begin_ + 3 * query_range.length();
-			//query_end_dna = (f-1) + 3 * (l.query_begin_+l.query_len_-1) + 3;
-		}
-		else {
-			query_source_range.end_ = dna_len + f - 3 * query_range.begin_ + 1;
-			//query_end_dna = dna_len + (f + 1) - 3 * (l.query_begin_+l.query_len_-1) - 2;
-			query_source_range.begin_ = query_source_range.end_ - 3 * query_range.length();
-		}
-	}
+	else
+		query_source_range = untranslate_range(query_range, frame, dna_len);
 }
 
 Hsp_context& Hsp_context::parse()
diff --git a/src/basic/match.h b/src/basic/match.h
index 555ecb6..991d40d 100644
--- a/src/basic/match.h
+++ b/src/basic/match.h
@@ -38,6 +38,8 @@ inline interval normalized_range(unsigned pos, int len, Strand strand)
 			: interval (pos + 1 + len, pos + 1);
 }
 
+interval untranslate_range(const interval &r, unsigned frame, size_t l);
+
 struct Diagonal_segment
 {
 	Diagonal_segment():
diff --git a/src/basic/sequence.h b/src/basic/sequence.h
index 349a9de..b52f3fb 100644
--- a/src/basic/sequence.h
+++ b/src/basic/sequence.h
@@ -100,8 +100,13 @@ struct sequence
 	}
 	std::ostream& print(std::ostream &os, const Value_traits &v) const
 	{
-		for (unsigned i = 0; i<len_; ++i)
-			os << v.alphabet[(long)data_[i]];
+		for (unsigned i = 0; i < len_; ++i) {
+			long l = (long)data_[i];
+			if ((l & 128) == 0)
+				os << v.alphabet[l];
+			else
+				os << (char)tolower(v.alphabet[l & 127]);
+		}
 		return os;
 	}
 	std::ostream& print(std::ostream &os, const Value_traits &v, Reversed) const
diff --git a/src/basic/value.h b/src/basic/value.h
index 5f7a256..58467dc 100644
--- a/src/basic/value.h
+++ b/src/basic/value.h
@@ -76,7 +76,7 @@ private:
 
 struct Value_traits
 {
-	Value_traits(const char *alphabet, Letter mask_char, const char *ignore);	
+	Value_traits(const char *alphabet, Letter mask_char, const char *ignore);
 	const char *alphabet;
 	unsigned alphabet_size;
 	Letter mask_char;
diff --git a/src/data/load_seqs.h b/src/data/load_seqs.h
index a203779..fd67db3 100644
--- a/src/data/load_seqs.h
+++ b/src/data/load_seqs.h
@@ -24,7 +24,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../basic/translate.h"
 #include "../util/seq_file_format.h"
 
-inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vector<Letter> &seq)
+inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vector<Letter> &seq, unsigned frame_mask)
 {
 	if (config.command == Config::blastp || config.command == Config::makedb || config.command == Config::random_seqs) {
 		ss.push_back(seq);
@@ -42,7 +42,7 @@ inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vecto
 
 		unsigned bestFrames(Translator::computeGoodFrames(proteins, config.get_run_len((unsigned)seq.size() / 3)));
 		for (unsigned j = 0; j < 6; ++j) {
-			if (bestFrames & (1 << j))
+			if ((bestFrames & (1 << j)) && (frame_mask & (1 << j)))
 				ss.push_back(proteins[j]);
 			else
 				ss.fill(proteins[j].size(), value_traits.mask_char);
@@ -68,10 +68,16 @@ inline size_t load_seqs(Input_stream &file,
 	vector<char> id;
 	string id2;
 
+	unsigned frame_mask = (1 << 6) - 1;
+	if (config.query_strands == "plus")
+		frame_mask = (1 << 3) - 1;
+	else if (config.query_strands == "minus")
+		frame_mask = ((1 << 3) - 1) << 3;
+
 	while (letters < max_letters && format.get_seq(id, seq, file)) {
 		if (seq.size() > 0 && (filter.empty() || id2.assign(id.data(), id.data() + id.size()).find(filter, 0) != string::npos)) {
 			ids->push_back(id);
-			letters += push_seq(**seqs, source_seqs, seq);
+			letters += push_seq(**seqs, source_seqs, seq, frame_mask);
 			++n;
 			if ((*seqs)->get_length() >(size_t)std::numeric_limits<int>::max())
 				throw std::runtime_error("Number of sequences in file exceeds supported maximum.");
diff --git a/src/data/reference.cpp b/src/data/reference.cpp
index c86caef..4e15766 100644
--- a/src/data/reference.cpp
+++ b/src/data/reference.cpp
@@ -39,6 +39,18 @@ bool blocked_processing;
 using std::cout;
 using std::endl;
 
+string* get_allseqids(const char *s)
+{
+	string *r = new string;
+	const vector<string> t(tokenize(s, "\1"));
+	for (vector<string>::const_iterator i = t.begin(); i != t.end(); ++i) {
+		if (i != t.begin())
+			r->append("\1");
+		r->append(i->substr(0, find_first_of(i->c_str(), Const::id_delimiters)));
+	}
+	return r;
+}
+
 struct Pos_record
 {
 	Pos_record()
@@ -211,4 +223,14 @@ void Database_file::get_seq()
 		seq.clear();
 		id.clear();
 	}
+}
+
+void db_info()
+{
+	Database_file db_file;
+	cout << "Database format version = " << ref_header.db_version << endl;
+	cout << "Diamond build = " << ref_header.build << endl;
+	cout << "Sequences = " << ref_header.sequences << endl;
+	cout << "Letters = " << ref_header.letters << endl;
+	db_file.close();
 }
\ No newline at end of file
diff --git a/src/data/reference.h b/src/data/reference.h
index 7eb8bb5..c74f5e9 100644
--- a/src/data/reference.h
+++ b/src/data/reference.h
@@ -129,6 +129,8 @@ inline size_t max_id_len(const String_set<0> &ids)
 	return max;
 }
 
+string* get_allseqids(const char *s);
+
 struct Ref_map
 {
 	Ref_map():
@@ -157,10 +159,13 @@ struct Ref_map
 			n = next_++;
 			data_[block][i] = n;
 			len_.push_back((uint32_t)ref_seqs::get().length(i));
+			const char *title = ref_ids::get()[i].c_str();
 			if (config.salltitles)
-				name_.push_back(new string(ref_ids::get()[i].c_str()));
+				name_.push_back(new string(title));
+			else if (config.sallseqid)
+				name_.push_back(get_allseqids(title));
 			else
-				name_.push_back(get_str(ref_ids::get()[i].c_str(), Const::id_delimiters));
+				name_.push_back(get_str(title, Const::id_delimiters));
 			mtx_.unlock();
 		}
 		return n;
diff --git a/src/data/taxonomy.cpp b/src/data/taxonomy.cpp
index 838f29f..9172a56 100644
--- a/src/data/taxonomy.cpp
+++ b/src/data/taxonomy.cpp
@@ -17,10 +17,16 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
 #include <stdio.h>
+#include <set>
 #include "taxonomy.h"
 #include "../util/compressed_stream.h"
 #include "../basic/config.h"
 #include "../util/merge_sort.h"
+#include "../util/log_stream.h"
+
+using std::cout;
+using std::endl;
+using std::set;
 
 Taxonomy taxonomy;
 
@@ -28,7 +34,7 @@ string& get_accession(string &t)
 {
 	size_t i;
 	if (t.compare(0, 6, "UniRef") == 0)
-		t.erase(0, 9);
+		t.erase(0, t.find('_', 0) + 1);
 	else if ((i = t.find_first_of('|', 0)) != string::npos) {
 		if (t.compare(0, 3, "gi|") == 0) {
 			t.erase(0, t.find_first_of('|', i + 1) + 1);
@@ -62,5 +68,78 @@ void Taxonomy::load()
 		/*if (f.line_count % 10000 == 0)
 			std::cout << f.line_count << endl;*/
 	}
+	f.close();
 	merge_sort(accession2taxid_.begin(), accession2taxid_.end(), config.threads_);
+}
+
+void Taxonomy::load_nodes()
+{
+	Input_stream f(config.nodesdmp);
+	unsigned taxid, parent;
+	while (!f.eof() && (f.getline(), !f.line.empty())) {
+		if (sscanf(f.line.c_str(), "%u\t|\t%u", &taxid, &parent) != 2)
+			throw std::runtime_error("Invalid nodes.dmp file format.");
+		//cout << taxid << '\t' << parent << endl;
+		parent_.resize(taxid + 1);
+		parent_[taxid] = parent;
+	}
+	f.close();
+}
+
+void Taxonomy::init()
+{
+	task_timer timer;
+	if (!config.prot_accession2taxid.empty()) {
+		timer.go("Loading taxonomy");
+		load();
+		timer.finish();
+	}
+
+	if (!config.nodesdmp.empty()) {
+		timer.go("Loading taxonomy nodes");
+		load_nodes();
+		timer.finish();
+	}
+}
+
+void Taxonomy::get_taxids(const char *id, set<unsigned> &taxons) const
+{
+	const vector<string> t(tokenize(id, "\1"));
+	for (vector<string>::const_iterator i = t.begin(); i < t.end(); ++i) {
+		const unsigned id = get(Taxonomy::Accession(*i));
+		if(id != 0)
+			taxons.insert(id);
+	}
+}
+
+unsigned Taxonomy::get_lca(unsigned t1, unsigned t2) const
+{
+	static const int max = 64;
+	if (t1 == t2 || t2 == 0)
+		return t1;
+	if (t1 == 0)
+		return t2;
+	unsigned p = t2;
+	set<unsigned> l;
+	int n = 0;
+	do {
+		p = get_parent(p);
+		if (p == 0)
+			return t1;
+		l.insert(p);
+		if (++n > max)
+			throw std::runtime_error("Path in taxonomy too long (1).");
+	} while (p != t1 && p != 1);
+	if (p == t1)
+		return p;
+	p = t1;
+	n = 0;
+	while (l.find(p) == l.end()) {
+		p = get_parent(p);
+		if (p == 0)
+			return t2;
+		if (++n > max)
+			throw std::runtime_error("Path in taxonomy too long (2).");
+	}
+	return p;
 }
\ No newline at end of file
diff --git a/src/data/taxonomy.h b/src/data/taxonomy.h
index 1ce6dc5..1a2537f 100644
--- a/src/data/taxonomy.h
+++ b/src/data/taxonomy.h
@@ -19,6 +19,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include <vector>
 #include <algorithm>
 #include <string>
+#include <set>
 #include "../basic/const.h"
 #include "../util/util.h"
 
@@ -71,7 +72,10 @@ struct Taxonomy
 		char s[max_accesion_len];
 	};
 
+	void init();
 	void load();
+	void load_nodes();
+	void get_taxids(const char *s, std::set<unsigned> &taxons) const;
 
 	unsigned get(const Accession &accession) const
 	{
@@ -82,9 +86,20 @@ struct Taxonomy
 			return 0;
 	}
 
+	unsigned get_parent(unsigned taxid) const
+	{
+		if (parent_.size() <= taxid)
+			throw std::runtime_error(string("No taxonomy node found for taxon id ") + to_string(taxid));
+		return parent_[taxid];
+	}
+
+	unsigned get_lca(unsigned t1, unsigned t2) const;
+
 private:
 	
 	std::vector<std::pair<Accession, unsigned> > accession2taxid_;
+	std::vector<unsigned> parent_;
+
 };
 
 extern Taxonomy taxonomy;
\ No newline at end of file
diff --git a/src/dp/dp.h b/src/dp/dp.h
index dbdec70..6a76900 100644
--- a/src/dp/dp.h
+++ b/src/dp/dp.h
@@ -94,6 +94,11 @@ struct Hsp_traits
 		score(0),
 		frame((int)frame)
 	{}
+	Hsp_traits(const interval &query_source_range):
+		query_source_range(query_source_range)
+	{}
+	Hsp_traits(const Hsp_data &hsp)
+	{}
 	int partial_score(const Diagonal_segment &d) const
 	{
 		const double overlap = std::max(d.subject_range().overlap_factor(subject_range), d.query_range().overlap_factor(query_range));
@@ -142,7 +147,7 @@ struct Hsp_traits
 		}
 	};
 	int d_min, d_max, score, frame;
-	interval query_range, subject_range;
+	interval query_source_range, query_range, subject_range;
 };
 
 template<typename _score>
diff --git a/src/dp/swipe.cpp b/src/dp/swipe.cpp
index d09b2ee..9b3e070 100644
--- a/src/dp/swipe.cpp
+++ b/src/dp/swipe.cpp
@@ -246,4 +246,4 @@ void swipe(const sequence &query, vector<sequence>::const_iterator subject_begin
 #ifdef __SSE2__
 	swipe<uint8_t>(query, subject_begin, subject_end, out);
 #endif
-}
+}
\ No newline at end of file
diff --git a/src/extra/match_file.h b/src/extra/match_file.h
index cd2ef69..75252d7 100644
--- a/src/extra/match_file.h
+++ b/src/extra/match_file.h
@@ -21,6 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 #include <stdio.h>
 #include <vector>
+#include "../util/binary_file.h"
 #include "blast_record.h"
 #include "../util/util.h"
 #include "../basic/reduction.h"
diff --git a/src/output/blast_pairwise_format.cpp b/src/output/blast_pairwise_format.cpp
index d4c4b20..1096e97 100644
--- a/src/output/blast_pairwise_format.cpp
+++ b/src/output/blast_pairwise_format.cpp
@@ -18,7 +18,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 #include "output_format.h"
 
-void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out) const
+void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out)
 {
 	static const unsigned width = 60;
 	out << '>';
@@ -67,9 +67,8 @@ void Pairwise_format::print_footer(Output_stream &out) const
 
 }
 
-void Pairwise_format::print_query_epilog(Text_buffer &out, bool unaligned) const
+void Pairwise_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const
 {
-
 }
 
 void Pairwise_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const
diff --git a/src/output/blast_tab_format.cpp b/src/output/blast_tab_format.cpp
index a85147a..59fafd3 100644
--- a/src/output/blast_tab_format.cpp
+++ b/src/output/blast_tab_format.cpp
@@ -95,17 +95,17 @@ Blast_tab_format::Blast_tab_format() :
 
 void print_staxids(Text_buffer &out, const char *id)
 {
-	const vector<string> t(tokenize(id, "\1"));	
 	std::set<unsigned> taxons;
-	for (vector<string>::const_iterator i = t.begin(); i < t.end(); ++i)
-		taxons.insert(taxonomy.get(Taxonomy::Accession(*i)));
+	taxonomy.get_taxids(id, taxons);
+	if (taxons.empty())
+		return;
 	std::set<unsigned>::const_iterator i = taxons.begin();
 	out << *(i++);
 	for (; i != taxons.end(); ++i)
 		out << ';' << *i;
 }
 
-void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) const
+void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out)
 {
 	for (vector<unsigned>::const_iterator i = fields.begin(); i != fields.end(); ++i) {
 		switch (*i) {
diff --git a/src/output/join_blocks.cpp b/src/output/join_blocks.cpp
index 0b2e63e..353e06b 100644
--- a/src/output/join_blocks.cpp
+++ b/src/output/join_blocks.cpp
@@ -21,6 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../data/queries.h"
 #include "../output/daa_write.h"
 #include "output_format.h"
+#include "../align/query_mapper.h"
 
 struct Join_fetcher
 {
@@ -123,7 +124,35 @@ struct Join_record
 
 };
 
-void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statistics, unsigned query, const char *query_name, unsigned query_source_len)
+struct Join_records
+{
+	Join_records(vector<Binary_buffer> &buf)
+	{
+		for (unsigned i = 0; i < current_ref_block; ++i) {
+			it.push_back(buf[i].begin());
+			Join_record::push_next(i, std::numeric_limits<unsigned>::max(), it.back(), records);
+		}
+		std::make_heap(records.begin(), records.end());
+	}
+	Target* get_target()
+	{
+		if (records.empty())
+			return 0;
+		const unsigned block = records.front().block_;
+		const unsigned subject = records.front().info_.subject_id;
+		Target *t = new Target(records.front().info_.score);
+		do {
+			std::pop_heap(records.begin(), records.end());
+			records.pop_back();
+			if (Join_record::push_next(block, subject, it[block], records))
+				std::push_heap(records.begin(), records.end());
+		} while (!records.empty() && records.front().info_.subject_id == subject);
+	}
+	vector<Join_record> records;
+	vector<Binary_buffer::Iterator> it;
+};
+
+void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statistics, unsigned query, const char *query_name, unsigned query_source_len, Output_format &f)
 {
 	sequence context[6];
 	for (unsigned i = 0; i < align_mode.query_contexts; ++i)
@@ -148,11 +177,11 @@ void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statis
 		if (config.output_range(n_target_seq, next.info_.score, top_score) || same_subject) {
 			//printf("q=%u s=%u n=%u ss=%u\n",query, next.info_.subject_id, n_target_seq, same_subject, next.info_.score);
 
-			if(*output_format == Output_format::daa)
+			if(f == Output_format::daa)
 				write_daa_record(out, next.info_);
 			else {
 				Hsp_data hsp(next.info_, query_source_len);
-				output_format->print_match(Hsp_context(hsp,
+				f.print_match(Hsp_context(hsp,
 					query,
 					context[align_mode.check_context(hsp.frame)],
 					align_mode.query_translated ? query_source_seqs::get()[query] : context[0],
@@ -205,21 +234,23 @@ void join_worker(Task_queue<Text_buffer,Join_writer> *queue)
 		if (*output_format != Output_format::daa && config.report_unaligned != 0) {
 			for (unsigned i = fetcher.unaligned_from; i < fetcher.query_id; ++i) {
 				output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), *out, true);
-				output_format->print_query_epilog(*out, true);
+				output_format->print_query_epilog(*out, query_ids::get()[i].c_str(), true);
 			}
 		}
 
-		if (*output_format == Output_format::daa)
+		auto_ptr<Output_format> f(output_format->clone());
+
+		if (*f == Output_format::daa)
 			seek_pos = write_daa_query_record(*out, query_name, query_seq);
 		else
-			output_format->print_query_intro(fetcher.query_id, query_name, (unsigned)query_seq.length(), *out, false);
+			f->print_query_intro(fetcher.query_id, query_name, (unsigned)query_seq.length(), *out, false);
 
-		join_query(fetcher.buf, *out, stat, fetcher.query_id, query_name, (unsigned)query_seq.length());
+		join_query(fetcher.buf, *out, stat, fetcher.query_id, query_name, (unsigned)query_seq.length(), *f);
 
-		if (*output_format == Output_format::daa)
+		if (*f == Output_format::daa)
 			finish_daa_query_record(*out, seek_pos);
 		else
-			output_format->print_query_epilog(*out, false);
+			f->print_query_epilog(*out, query_name, false);
 
 		queue->push(n);
 	}
@@ -242,7 +273,7 @@ void join_blocks(unsigned ref_blocks, Output_stream &master_out, const vector<Te
 		Text_buffer out;
 		for (unsigned i = Join_fetcher::query_last + 1; i < query_ids::get().get_length(); ++i) {
 			output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), out, true);
-			output_format->print_query_epilog(out, true);
+			output_format->print_query_epilog(out, query_ids::get()[i].c_str(), true);
 		}
 		writer(out);
 	}
diff --git a/src/output/output.h b/src/output/output.h
index 03fe1fe..8bef7f5 100644
--- a/src/output/output.h
+++ b/src/output/output.h
@@ -19,6 +19,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #ifndef OUTPUT_H_
 #define OUTPUT_H_
 
+#include <map>
 #include "../util/binary_file.h"
 #include "../basic/packed_transcript.h"
 #include "../util/text_buffer.h"
@@ -98,8 +99,42 @@ struct Intermediate_record
 	Packed_transcript transcript;
 };
 
-#ifndef ST_JOIN
 void join_blocks(unsigned ref_blocks, Output_stream &master_out, const vector<Temp_file> &tmp_file);
-#endif
+
+struct Output_sink
+{
+	Output_sink(size_t begin, Output_stream *f) :
+		f_(f),
+		next_(begin),
+		size_(0),
+		max_size_(0)
+	{}
+	void push(size_t n, Text_buffer *buf);
+	size_t size() const
+	{
+		return size_;
+	}
+	size_t max_size() const
+	{
+		return max_size_;
+	}
+	static Output_sink& get()
+	{
+		return *instance;
+	}
+	size_t next() const
+	{
+		return next_;
+	}
+	static auto_ptr<Output_sink> instance;
+private:
+	void flush(Text_buffer *buf);
+	tthread::mutex mtx_;
+	Output_stream* const f_;
+	std::map<size_t, Text_buffer*> backlog_;
+	size_t next_, size_, max_size_;
+};
+
+void heartbeat_worker(size_t qend);
 
 #endif
\ No newline at end of file
diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp
index 74786f6..ef6113b 100644
--- a/src/output/output_format.cpp
+++ b/src/output/output_format.cpp
@@ -84,12 +84,31 @@ Output_format* get_output_format()
 		return new Pairwise_format;
 	else if (f[0] == "null")
 		return new Null_format;
+	else if (f[0] == "102")
+		return new Taxon_format;
 	else
-		throw std::runtime_error("Invalid output format. Allowed values: 5,6,100,101");
+		throw std::runtime_error("Invalid output format. Allowed values: 0,5,6,100,101,102");
 }
 
+void init_output()
+{
+	output_format = auto_ptr<Output_format>(get_output_format());
+	if (*output_format == Output_format::taxon && config.toppercent == 100.0)
+		config.toppercent = 10.0;
+	if (config.toppercent == 100.0) {
+		message_stream << "#Target sequences to report alignments for: ";
+		if (config.max_alignments == 0) {
+			config.max_alignments = std::numeric_limits<uint64_t>::max();
+			message_stream << "unlimited" << endl;
+		}
+		else
+			message_stream << config.max_alignments << endl;
+	}
+	else
+		message_stream << "Percentage range of top alignment score to report hits: " << config.toppercent << endl;
+}
 
-void XML_format::print_match(const Hsp_context &r, Text_buffer &out) const
+void XML_format::print_match(const Hsp_context &r, Text_buffer &out)
 {
 	if(r.hsp_num == 0) {
 		if (r.hit_num > 0)
@@ -183,7 +202,7 @@ void XML_format::print_query_intro(size_t query_num, const char *query_name, uns
 		<< "<Iteration_hits>" << '\n';
 }
 
-void XML_format::print_query_epilog(Text_buffer &out, bool unaligned) const
+void XML_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const
 {
 	if (!unaligned) {
 		out << "  </Hit_hsps>" << '\n'
diff --git a/src/output/output_format.h b/src/output/output_format.h
index 4ed34a5..15880de 100644
--- a/src/output/output_format.h
+++ b/src/output/output_format.h
@@ -34,14 +34,15 @@ struct Output_format
 	{}
 	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const
 	{}
-	virtual void print_query_epilog(Text_buffer &out, bool unaligned) const
+	virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const
 	{}
-	virtual void print_match(const Hsp_context& r, Text_buffer &out) const
+	virtual void print_match(const Hsp_context& r, Text_buffer &out)
 	{}
 	virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const
 	{ }
 	virtual void print_footer(Output_stream &f) const
 	{ }
+	virtual Output_format* clone() const = 0;
 	virtual ~Output_format()
 	{ }
 	static void print_title(Text_buffer &buf, const char *id, bool full_titles, bool all_titles, const char *separator);
@@ -50,7 +51,7 @@ struct Output_format
 		return code;
 	}
 	unsigned code;
-	enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null };
+	enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null, taxon };
 };
 
 extern auto_ptr<Output_format> output_format;
@@ -60,6 +61,10 @@ struct Null_format : public Output_format
 	Null_format() :
 		Output_format(null)
 	{}
+	virtual Output_format* clone() const
+	{
+		return new Null_format(*this);
+	}
 };
 
 struct DAA_format : public Output_format
@@ -67,6 +72,10 @@ struct DAA_format : public Output_format
 	DAA_format():
 		Output_format(daa)
 	{}
+	virtual Output_format* clone() const
+	{
+		return new DAA_format(*this);
+	}
 };
 
 struct Blast_tab_format : public Output_format
@@ -74,9 +83,13 @@ struct Blast_tab_format : public Output_format
 	static const char* field_str[];
 	Blast_tab_format();
 	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
-	virtual void print_match(const Hsp_context& r, Text_buffer &out) const;
+	virtual void print_match(const Hsp_context& r, Text_buffer &out);
 	virtual ~Blast_tab_format()
 	{ }
+	virtual Output_format* clone() const
+	{
+		return new Blast_tab_format(*this);
+	}
 	vector<unsigned> fields;
 };
 
@@ -85,11 +98,15 @@ struct Sam_format : public Output_format
 	Sam_format():
 		Output_format(sam)
 	{ }
-	virtual void print_match(const Hsp_context& r, Text_buffer &out) const;
+	virtual void print_match(const Hsp_context& r, Text_buffer &out);
 	virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const;
 	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
 	virtual ~Sam_format()
 	{ }
+	virtual Output_format* clone() const
+	{
+		return new Sam_format(*this);
+	}
 };
 
 struct XML_format : public Output_format
@@ -99,13 +116,17 @@ struct XML_format : public Output_format
 	{
 		config.salltitles = true;
 	}
-	virtual void print_match(const Hsp_context &r, Text_buffer &out) const;
+	virtual void print_match(const Hsp_context &r, Text_buffer &out);
 	virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const;
 	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
-	virtual void print_query_epilog(Text_buffer &out, bool unaligned) const;
+	virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const;
 	virtual void print_footer(Output_stream &f) const;
 	virtual ~XML_format()
 	{ }
+	virtual Output_format* clone() const
+	{
+		return new XML_format(*this);
+	}
 };
 
 struct Pairwise_format : public Output_format
@@ -115,16 +136,46 @@ struct Pairwise_format : public Output_format
 	{
 		config.salltitles = true;
 	}
-	virtual void print_match(const Hsp_context &r, Text_buffer &out) const;
+	virtual void print_match(const Hsp_context &r, Text_buffer &out);
 	virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const;
 	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
-	virtual void print_query_epilog(Text_buffer &out, bool unaligned) const;
+	virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const;
 	virtual void print_footer(Output_stream &f) const;
 	virtual ~Pairwise_format()
 	{ }
+	virtual Output_format* clone() const
+	{
+		return new Pairwise_format(*this);
+	}
+};
+
+struct Taxon_format : public Output_format
+{
+	Taxon_format() :
+		Output_format(taxon),
+		taxid(0),
+		evalue(std::numeric_limits<double>::max())
+	{
+		config.salltitles = true;
+		if (config.prot_accession2taxid.empty())
+			throw std::runtime_error("Output format requires setting the --taxonmap parameter.");
+		if (config.nodesdmp.empty())
+			throw std::runtime_error("Output format requires setting the --taxonnodes parameter.");
+	}
+	virtual void print_match(const Hsp_context &r, Text_buffer &out);
+	virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const;
+	virtual ~Taxon_format()
+	{ }
+	virtual Output_format* clone() const
+	{
+		return new Taxon_format(*this);
+	}
+	unsigned taxid;
+	double evalue;
 };
 
 Output_format* get_output_format();
+void init_output();
 void print_hsp(Hsp_data &hsp, sequence query);
 
 #endif /* OUTPUT_FORMAT_H_ */
diff --git a/src/output/output_sink.cpp b/src/output/output_sink.cpp
new file mode 100644
index 0000000..d4e8f8b
--- /dev/null
+++ b/src/output/output_sink.cpp
@@ -0,0 +1,86 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "output.h"
+#include "../data/queries.h"
+
+using std::map;
+using std::auto_ptr;
+
+auto_ptr<Output_sink> Output_sink::instance;
+
+void Output_sink::push(size_t n, Text_buffer *buf)
+{
+	mtx_.lock();
+	//cout << "n=" << n << " next=" << next_ << endl;
+	if (n != next_) {
+		backlog_[n] = buf;
+		size_ += buf ? buf->alloc_size() : 0;
+		max_size_ = std::max(max_size_, size_);
+		mtx_.unlock();
+	}
+	else
+		flush(buf);
+}
+
+void Output_sink::flush(Text_buffer *buf)
+{
+	size_t n = next_ + 1;
+	vector<Text_buffer*> out;
+	out.push_back(buf);
+	map<size_t, Text_buffer*>::iterator i;
+	do {
+		while ((i = backlog_.begin()) != backlog_.end() && i->first == n) {
+			out.push_back(i->second);
+			backlog_.erase(i);
+			++n;
+		}
+		mtx_.unlock();
+		size_t size = 0;
+		for (vector<Text_buffer*>::iterator j = out.begin(); j < out.end(); ++j) {
+			if (*j) {
+				f_->write((*j)->get_begin(), (*j)->size());
+				if (*j != buf)
+					size += (*j)->alloc_size();
+				delete *j;
+			}
+		}
+		out.clear();
+		mtx_.lock();
+		size_ -= size;
+	} while ((i = backlog_.begin()) != backlog_.end() && i->first == n);
+	next_ = n;
+	mtx_.unlock();
+}
+
+void heartbeat_worker(size_t qend)
+{
+	static const int interval = 100;
+	int n = 0;
+	while (Output_sink::get().next() < qend) {
+		if (n == interval) {
+			const string title(query_ids::get()[Output_sink::get().next()].c_str());
+			verbose_stream << "Queries=" << Output_sink::get().next() << " size=" << megabytes(Output_sink::get().size()) << " max_size=" << megabytes(Output_sink::get().max_size())
+				<< " next=" << title.substr(0, title.find(' ')) << endl;
+			n = 0;
+		}
+		else
+			++n;
+		tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10));
+	}
+}
\ No newline at end of file
diff --git a/src/output/sam_format.cpp b/src/output/sam_format.cpp
index 932e406..d8a4469 100644
--- a/src/output/sam_format.cpp
+++ b/src/output/sam_format.cpp
@@ -82,7 +82,7 @@ void Sam_format::print_query_intro(size_t query_num, const char *query_name, uns
 	}
 }
 
-void Sam_format::print_match(const Hsp_context& r, Text_buffer &out) const
+void Sam_format::print_match(const Hsp_context& r, Text_buffer &out)
 {
 	out.write_until(r.query_name, Const::id_delimiters);
 	out << '\t' << '0' << '\t';
diff --git a/src/basic/const.h b/src/output/target_culling.cpp
similarity index 62%
copy from src/basic/const.h
copy to src/output/target_culling.cpp
index 0dd7c7a..52df2fe 100644
--- a/src/basic/const.h
+++ b/src/output/target_culling.cpp
@@ -16,31 +16,9 @@ You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#ifndef CONST_H_
-#define CONST_H_
+#include "target_culling.h"
 
-struct Const
+Target_culling* Target_culling::get(const Target* first)
 {
-
-	enum {
-		build_version = 109,
-		daa_version = 0,
-		seedp_bits = 10,
-		seedp = 1<<seedp_bits,
-		max_seed_weight = 32,
-		max_shapes = 16,
-		max_shape_len = 32
-	};
-
-	static const char* version_string;
-	static const char* program_name;
-	static const char* id_delimiters;
-
-};
-
-#define SIMPLE_SEARCH
-// #define FREQUENCY_MASKING
-// #define ST_JOIN
-// #define NO_COLLISION_FILTER
-
-#endif /* CONST_H_ */
+	return config.query_overlap_culling == 0.0 ? new Target_culling(first) : new Overlap_culling;
+}
\ No newline at end of file
diff --git a/src/output/target_culling.h b/src/output/target_culling.h
new file mode 100644
index 0000000..365b548
--- /dev/null
+++ b/src/output/target_culling.h
@@ -0,0 +1,73 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#ifndef TARGET_CULLING_H_
+#define TARGET_CULLING_H_
+
+#include <vector>
+#include "../align/query_mapper.h"
+
+struct Target_culling
+{
+	Target_culling():
+		top_score_(0)
+	{}
+	Target_culling(const Target *first):
+		n_(0),
+		top_score_(first ? first->filter_score : 0)
+	{}
+	virtual int cull(const Target &t)
+	{
+		return config.output_range((unsigned)n_, t.filter_score, top_score_) ? use : finished;
+	}
+	virtual void add(Target *t)
+	{
+		++n_;
+	}
+	enum { finished = 0, next = 1, use = 2};
+	static Target_culling* get(const Target* first);
+private:
+	size_t n_;
+	const int top_score_;
+};
+
+struct Overlap_culling : Target_culling
+{
+	Overlap_culling()
+	{}
+	virtual int cull(const Target &t)
+	{
+		return t.is_enveloped(targets_.begin(), targets_.end(), config.query_overlap_culling / 100.0, int(double(t.filter_score) / (1.0 - config.toppercent / 100.0)))
+			? next : use;
+	}
+	virtual void add(Target *t)
+	{
+		targets_.push_back(t);
+	}
+	virtual void add(unsigned n, const Hsp_data &hsp)
+	{
+		if (n >= targets_.size()) {
+			targets_.push_back(new Target(hsp.score));
+		}
+		targets_[n]->ts.push_back(Hsp_traits(hsp));
+	}
+private:
+	std::vector<Target*> targets_;
+};
+
+#endif
\ No newline at end of file
diff --git a/src/output/taxon_format.cpp b/src/output/taxon_format.cpp
new file mode 100644
index 0000000..5979ae4
--- /dev/null
+++ b/src/output/taxon_format.cpp
@@ -0,0 +1,51 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <set>
+#include "output_format.h"
+#include "../data/taxonomy.h"
+
+using std::set;
+
+void Taxon_format::print_match(const Hsp_context &r, Text_buffer &out)
+{
+	set<unsigned> taxons;
+	taxonomy.get_taxids(r.subject_name, taxons);
+	if (taxons.empty())
+		return;
+	evalue = std::min(evalue, r.evalue());
+	try {
+		for (set<unsigned>::const_iterator i = taxons.begin(); i != taxons.end(); ++i)
+			taxid = taxonomy.get_lca(taxid, *i);
+	}
+	catch (std::exception &) {
+		std::cerr << "Query=" << r.query_name << endl << "Subject=" << r.subject_name << endl;
+		throw;
+	}
+}
+
+void Taxon_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const
+{
+	out.write_until(query_title, Const::id_delimiters);
+	out << '\t' << taxid << '\t';
+	if (taxid != 0)
+		out.print_e(evalue);
+	else
+		out << '0';
+	out << '\n';
+}
\ No newline at end of file
diff --git a/src/output/view.h b/src/output/view.cpp
similarity index 73%
rename from src/output/view.h
rename to src/output/view.cpp
index 21a2ba6..2a66b01 100644
--- a/src/output/view.h
+++ b/src/output/view.cpp
@@ -16,9 +16,6 @@ You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#ifndef VIEW_H_
-#define VIEW_H_
-
 #include "../basic/config.h"
 #include "../util/binary_file.h"
 #include "../util/text_buffer.h"
@@ -28,13 +25,14 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../util/task_queue.h"
 #include "../basic/score_matrix.h"
 #include "../util/thread.h"
+#include "../data/taxonomy.h"
 
 const unsigned view_buf_size = 32;
 
 struct View_writer
 {
-	View_writer():
-		f_ (config.compression == 1
+	View_writer() :
+		f_(config.compression == 1
 			? new Compressed_ostream(config.output_file)
 			: new Output_stream(config.output_file))
 	{ }
@@ -52,17 +50,18 @@ struct View_writer
 
 struct View_fetcher
 {
-	View_fetcher(DAA_file &daa):
-		daa (daa)
+	View_fetcher(DAA_file &daa) :
+		daa(daa)
 	{ }
 	bool operator()()
 	{
 		n = 0;
-		for(unsigned i=0;i<view_buf_size;++i)
+		for (unsigned i = 0; i<view_buf_size; ++i)
 			if (!daa.read_query_buffer(buf[i], query_num)) {
 				query_num -= n - 1;
 				return false;
-			} else
+			}
+			else
 				++n;
 		query_num -= n - 1;
 		return true;
@@ -73,63 +72,73 @@ struct View_fetcher
 	DAA_file &daa;
 };
 
-void view_query(DAA_query_record &r, Text_buffer &out, const Output_format &format)
+void view_query(DAA_query_record &r, Text_buffer &out, Output_format &format)
 {
-	format.print_query_intro(r.query_num, r.query_name.c_str(), (unsigned)r.query_len(), out, false);
-	for (DAA_query_record::Match_iterator i = r.begin(); i.good(); ++i) {
+	auto_ptr<Output_format> f(format.clone());
+	f->print_query_intro(r.query_num, r.query_name.c_str(), (unsigned)r.query_len(), out, false);
+	DAA_query_record::Match_iterator i = r.begin();
+	const unsigned top_score = i.good() ? i->score : 0;
+	for (; i.good(); ++i) {
 		if (i->frame > 2 && config.forwardonly)
 			continue;
-		format.print_match(i->context(), out);
+		if (!config.output_range(i->hit_num, i->score, top_score))
+			break;
+		f->print_match(i->context(), out);
 	}
-	format.print_query_epilog(out, false);
+	f->print_query_epilog(out, r.query_name.c_str(), false);
 }
 
 struct View_context
 {
-	View_context(DAA_file &daa, View_writer &writer, const Output_format &format):
-		daa (daa),
-		writer (writer),
-		queue (3*config.threads_, writer),
-		format (format)
+	View_context(DAA_file &daa, View_writer &writer, Output_format &format) :
+		daa(daa),
+		writer(writer),
+		queue(3 * config.threads_, writer),
+		format(format)
 	{ }
 	void operator()(unsigned thread_id)
 	{
 		try {
 			size_t n;
-			View_fetcher query_buf (daa);
+			View_fetcher query_buf(daa);
 			Text_buffer *buffer = 0;
-			while(queue.get(n, buffer, query_buf)) {
+			while (queue.get(n, buffer, query_buf)) {
 				for (unsigned j = 0; j < query_buf.n; ++j) {
 					DAA_query_record r(daa, query_buf.buf[j], query_buf.query_num + j);
 					view_query(r, *buffer, format);
 				}
 				queue.push(n);
 			}
-		} catch(std::exception &e) {
+		}
+		catch (std::exception &e) {
 			std::cout << e.what() << std::endl;
 			std::terminate();
 		}
 	}
 	DAA_file &daa;
 	View_writer &writer;
-	Task_queue<Text_buffer,View_writer> queue;
-	const Output_format &format;
+	Task_queue<Text_buffer, View_writer> queue;
+	Output_format &format;
 };
 
 void view()
 {
-	DAA_file daa (config.daa_file);
+	task_timer timer("Loading subject IDs");
+	DAA_file daa(config.daa_file);
 	score_matrix = Score_matrix("", daa.lambda(), daa.kappa(), daa.gap_open_penalty(), daa.gap_extension_penalty());
+	timer.finish();
 
 	message_stream << "Scoring parameters: " << score_matrix << endl;
 	verbose_stream << "Build version = " << daa.diamond_build() << endl;
 	message_stream << "DB sequences = " << daa.db_seqs() << endl;
 	message_stream << "DB sequences used = " << daa.db_seqs_used() << endl;
 	message_stream << "DB letters = " << daa.db_letters() << endl;
-	
-	task_timer timer("Generating output");
+
+	init_output();
+	taxonomy.init();
+
+	timer.go("Generating output");
 	View_writer writer;
-	output_format = auto_ptr<Output_format>(get_output_format());
 
 	Binary_buffer buf;
 	size_t query_num;
@@ -152,5 +161,3 @@ void view()
 
 	output_format->print_footer(*writer.f_);
 }
-
-#endif /* VIEW_H_ */
diff --git a/src/run/double_indexed.cpp b/src/run/double_indexed.cpp
index a6fa614..ab2ede5 100644
--- a/src/run/double_indexed.cpp
+++ b/src/run/double_indexed.cpp
@@ -22,7 +22,6 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../data/queries.h"
 #include "../basic/statistics.h"
 #include "../basic/shape_config.h"
-#include "../align/align_queries.h"
 #include "../search/align_range.h"
 #include "../util/seq_file_format.h"
 #include "../data/load_seqs.h"
@@ -320,7 +319,7 @@ void master_thread_di()
 	timer2.start();
 
 	align_mode = Align_mode(Align_mode::from_command(config.command));
-	output_format = auto_ptr<Output_format>(get_output_format());
+	init_output();
 
 	message_stream << "Temporary directory: " << Temp_file::get_temp_dir() << endl;
 
@@ -343,12 +342,7 @@ void master_thread_di()
 	Config::set_option(config.db_size, (uint64_t)ref_header.letters);
 
 	set_max_open_files(config.query_bins * config.threads_ + unsigned(ref_header.letters / (size_t)(config.chunk_size * 1e9)) + 16);
-
-	if (!config.prot_accession2taxid.empty()) {
-		timer.go("Loading taxonomy");
-		taxonomy.load();
-		timer.finish();
-	}
+	taxonomy.init();
 
 	master_thread(db_file, timer2);
 }
\ No newline at end of file
diff --git a/src/run/main.cpp b/src/run/main.cpp
index 79b1fe5..db7e36d 100644
--- a/src/run/main.cpp
+++ b/src/run/main.cpp
@@ -18,7 +18,6 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 #include <iostream>
 #include "../basic/config.h"
-#include "../output/view.h"
 #include "tools.h"
 #include "../extra/compare.h"
 
@@ -31,6 +30,9 @@ void model_seqs();
 void opt();
 void run_masker();
 void fastq2fasta();
+void view();
+void make_db();
+void db_info();
 
 int main(int ac, const char* av[])
 {
@@ -95,6 +97,9 @@ int main(int ac, const char* av[])
 		case Config::fastq2fasta:
 			fastq2fasta();
 			break;
+		case Config::dbinfo:
+			db_info();
+			break;
 		default:
 			return 1;
 		}
diff --git a/src/run/tools.cpp b/src/run/tools.cpp
index 5e76d01..7b6b6c8 100644
--- a/src/run/tools.cpp
+++ b/src/run/tools.cpp
@@ -118,14 +118,14 @@ void run_masker()
 		cout << '>' << string(id.data(), id.size()) << endl;
 		seq2 = seq;
 		Masking::get()(seq2.data(), seq2.size());
-		for (size_t i = 0; i < seq.size(); ++i) {
+		/*for (size_t i = 0; i < seq.size(); ++i) {
 			char c = value_traits.alphabet[(long)seq[i]];
 			if (seq2[i] == value_traits.mask_char)
 				c = tolower(c);
 			cout << c;
 		}
-		cout << endl;
-		//cout << sequence(seq.data(), seq.size()) << endl;
+		cout << endl;*/
+		cout << sequence(seq2.data(), seq2.size()) << endl;
 	}
 }
 
diff --git a/src/util/log_stream.h b/src/util/log_stream.h
index 9aacba1..06bd34c 100644
--- a/src/util/log_stream.h
+++ b/src/util/log_stream.h
@@ -69,6 +69,11 @@ extern Message_stream log_stream;
 
 struct task_timer
 {
+	task_timer(unsigned level = 1) :
+		level_(level),
+		msg_(0)
+	{
+	}
 	task_timer(const char *msg, unsigned level=1) :
 		level_(level),
 		msg_(msg)
diff --git a/src/basic/const.h b/src/util/queue.h
similarity index 60%
copy from src/basic/const.h
copy to src/util/queue.h
index 0dd7c7a..cc24dfc 100644
--- a/src/basic/const.h
+++ b/src/util/queue.h
@@ -16,31 +16,38 @@ You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#ifndef CONST_H_
-#define CONST_H_
+#include "tinythread.h"
 
-struct Const
+struct Queue
 {
-
-	enum {
-		build_version = 109,
-		daa_version = 0,
-		seedp_bits = 10,
-		seedp = 1<<seedp_bits,
-		max_seed_weight = 32,
-		max_shapes = 16,
-		max_shape_len = 32
-	};
-
-	static const char* version_string;
-	static const char* program_name;
-	static const char* id_delimiters;
-
-};
-
-#define SIMPLE_SEARCH
-// #define FREQUENCY_MASKING
-// #define ST_JOIN
-// #define NO_COLLISION_FILTER
-
-#endif /* CONST_H_ */
+	enum { end = 0xffffffffffffffffllu };
+	Queue(size_t begin, size_t end) :
+		next_(begin),
+		end_(end)
+	{}
+	template<typename _f>
+	size_t get(_f &f)
+	{
+		mtx_.lock();
+		const size_t q = next_++;
+		if (q >= end_) {
+			mtx_.unlock();
+			return Queue::end;
+		}
+		f(q);
+		mtx_.unlock();
+		return q;
+	}
+	size_t next() const
+	{
+		return next_;
+	}
+	size_t get_end() const
+	{
+		return end_;
+	}
+private:
+	tthread::mutex mtx_;
+	size_t next_;
+	const size_t end_;
+};
\ No newline at end of file

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/diamond-aligner.git



More information about the debian-med-commit mailing list