[med-svn] [diamond-aligner] 01/07: New upstream version 0.9.10+dfsg
Andreas Tille
tille at debian.org
Mon Sep 18 13:16:24 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository diamond-aligner.
commit baec593e94cc4cfd918fa334eb9493a638abdc70
Author: Andreas Tille <tille at debian.org>
Date: Mon Sep 18 13:30:39 2017 +0200
New upstream version 0.9.10+dfsg
---
CMakeLists.txt | 4 +
README.rst | 2 +-
build_simple.sh | 4 +
src/ChangeLog | 14 +-
src/align/align.cpp | 288 +++++------------------
src/align/align.h | 64 +----
src/align/align_queries.h | 67 ------
src/align/align_target.cpp | 4 +
src/align/query_mapper.cpp | 133 ++++++-----
src/align/query_mapper.h | 16 ++
src/basic/basic.cpp | 4 +-
src/basic/config.cpp | 28 ++-
src/basic/config.h | 6 +-
src/basic/const.h | 2 +-
src/basic/hssp.cpp | 30 ++-
src/basic/match.h | 2 +
src/basic/sequence.h | 9 +-
src/basic/value.h | 2 +-
src/data/load_seqs.h | 12 +-
src/data/reference.cpp | 22 ++
src/data/reference.h | 9 +-
src/data/taxonomy.cpp | 81 ++++++-
src/data/taxonomy.h | 15 ++
src/dp/dp.h | 7 +-
src/dp/swipe.cpp | 2 +-
src/extra/match_file.h | 1 +
src/output/blast_pairwise_format.cpp | 5 +-
src/output/blast_tab_format.cpp | 8 +-
src/output/join_blocks.cpp | 51 +++-
src/output/output.h | 39 ++-
src/output/output_format.cpp | 25 +-
src/output/output_format.h | 69 +++++-
src/output/output_sink.cpp | 86 +++++++
src/output/sam_format.cpp | 2 +-
src/{basic/const.h => output/target_culling.cpp} | 30 +--
src/output/target_culling.h | 73 ++++++
src/output/taxon_format.cpp | 51 ++++
src/output/{view.h => view.cpp} | 67 +++---
src/run/double_indexed.cpp | 10 +-
src/run/main.cpp | 7 +-
src/run/tools.cpp | 6 +-
src/util/log_stream.h | 5 +
src/{basic/const.h => util/queue.h} | 59 +++--
43 files changed, 830 insertions(+), 591 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05d6c12..35ee8bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,6 +85,10 @@ add_executable(diamond src/run/main.cpp
src/data/seed_set.cpp
src/util/binary_file.cpp
src/util/simd.cpp
+ src/output/taxon_format.cpp
+ src/output/view.cpp
+ src/output/output_sink.cpp
+ src/output/target_culling.cpp
)
target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/README.rst b/README.rst
index 7d043f3..40c77f1 100644
--- a/README.rst
+++ b/README.rst
@@ -8,7 +8,7 @@ Please read the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond
Installing the software on your system may be done by downloading it in binary format for immediate use::
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.8/diamond-linux64.tar.gz
+ wget http://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted ``diamond`` binary file should be moved to a directory contained in your executable search path (PATH environment variable).
diff --git a/build_simple.sh b/build_simple.sh
index 17fb696..6a8a4dc 100755
--- a/build_simple.sh
+++ b/build_simple.sh
@@ -53,4 +53,8 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
src/data/seed_set.cpp \
src/util/binary_file.cpp \
src/util/simd.cpp \
+ src/output/taxon_format.cpp \
+ src/output/view.cpp \
+ src/output/output_sink.cpp \
+ src/output/target_culling.cpp \
-lz -lpthread -o diamond
diff --git a/src/ChangeLog b/src/ChangeLog
index b1f7103..feaca8c 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,13 @@
+[0.9.10]
+- added --strand option to choose query strand
+- added dbinfo command
+
+[0.9.9]
+- Added taxonomic classification format.
+- Fixed a bug in getseq printing masked residues.
+- Fixed parsing of UniRef100_ sequence id prefixes.
+- Added support for using the staxids output field in diamond view.
+
[0.9.8]
- Fixed a compiler errror.
@@ -205,7 +215,7 @@
[0.7.11]
- added --version switch
-- added static build optiom for CMake build
+- added static build option for CMake build
- fixed a bug that would cause a return code of 1 without an error
[0.7.10]
@@ -270,4 +280,4 @@
- reduced usage of temporary disk space and memory
- fixed a compiler error for GCC 4.1.2
- fixed a bug that could cause the program to hang when running out of temporary space or memory
-- added --forwardonly option to view command
+- added --forwardonly option to view command
\ No newline at end of file
diff --git a/src/align/align.cpp b/src/align/align.cpp
index 8b6795c..2e60268 100644
--- a/src/align/align.cpp
+++ b/src/align/align.cpp
@@ -18,93 +18,65 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../basic/value.h"
#include "align.h"
-#include "align_queries.h"
#include "../data/reference.h"
#include "../output/output_format.h"
+#include "../util/queue.h"
+#include "../output/output.h"
+#include "query_mapper.h"
+#include "../util/merge_sort.h"
using std::map;
-auto_ptr<Simple_query_queue> Simple_query_queue::instance;
-auto_ptr<Output_sink> Output_sink::instance;
-
-size_t Simple_query_queue::get(vector<hit>::iterator &begin, vector<hit>::iterator &end)
+struct Align_fetcher
{
- mtx_.lock();
- const unsigned query = (unsigned)(next_++),
- c = align_mode.query_contexts;
- if (query >= qend_) {
- mtx_.unlock();
- return Simple_query_queue::end;
+ static void init(size_t qbegin, size_t qend, vector<hit>::iterator begin, vector<hit>::iterator end)
+ {
+ it_ = begin;
+ end_ = end;
+ queue_ = auto_ptr<Queue>(new Queue(qbegin, qend));
}
- begin = it_;
- while (it_ < end_ && it_->query_ / c == query)
- ++it_;
- end = it_;
- mtx_.unlock();
- return query;
-}
-
-void Output_sink::push(size_t n, Text_buffer *buf)
-{
- mtx_.lock();
- //cout << "n=" << n << " next=" << next_ << endl;
- if (n != next_) {
- backlog_[n] = buf;
- size_ += buf ? buf->alloc_size() : 0;
- max_size_ = std::max(max_size_, size_);
- mtx_.unlock();
+ void operator()(size_t query)
+ {
+ const unsigned q = (unsigned)query,
+ c = align_mode.query_contexts;
+ begin = it_;
+ while (it_ < end_ && it_->query_ / c == q)
+ ++it_;
+ end = it_;
+ this->query = query;
}
- else
- flush(buf);
-}
+ bool get()
+ {
+ return queue_->get(*this) != Queue::end;
+ }
+ size_t query;
+ vector<hit>::iterator begin, end;
+private:
+ static vector<hit>::iterator it_, end_;
+ static auto_ptr<Queue> queue_;
+};
-void Output_sink::flush(Text_buffer *buf)
-{
- size_t n = next_ + 1;
- vector<Text_buffer*> out;
- out.push_back(buf);
- map<size_t, Text_buffer*>::iterator i;
- do {
- while ((i = backlog_.begin()) != backlog_.end() && i->first == n) {
- out.push_back(i->second);
- backlog_.erase(i);
- ++n;
- }
- mtx_.unlock();
- size_t size = 0;
- for (vector<Text_buffer*>::iterator j = out.begin(); j < out.end(); ++j) {
- if (*j) {
- f_->write((*j)->get_begin(), (*j)->size());
- if(*j != buf)
- size += (*j)->alloc_size();
- delete *j;
- }
- }
- out.clear();
- mtx_.lock();
- size_ -= size;
- } while ((i = backlog_.begin()) != backlog_.end() && i->first == n);
- next_ = n;
- mtx_.unlock();
-}
+auto_ptr<Queue> Align_fetcher::queue_;
+vector<hit>::iterator Align_fetcher::it_;
+vector<hit>::iterator Align_fetcher::end_;
void align_worker(size_t thread_id)
{
- vector<hit>::iterator begin, end;
- size_t query;
+ Align_fetcher hits;
Statistics stat;
- while ((query = Simple_query_queue::get().get(begin, end)) != Simple_query_queue::end) {
- if (end == begin) {
+ while (hits.get()) {
+ if (hits.end == hits.begin) {
Text_buffer *buf = 0;
if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0) {
buf = new Text_buffer;
- output_format->print_query_intro(query, query_ids::get()[query].c_str(), get_source_query_len((unsigned)query), *buf, true);
- output_format->print_query_epilog(*buf, true);
+ const char *query_title = query_ids::get()[hits.query].c_str();
+ output_format->print_query_intro(hits.query, query_title, get_source_query_len((unsigned)hits.query), *buf, true);
+ output_format->print_query_epilog(*buf, query_title, true);
}
- Output_sink::get().push(query, buf);
+ Output_sink::get().push(hits.query, buf);
continue;
}
- Query_mapper mapper(query, begin, end);
+ Query_mapper mapper(hits.query, hits.begin, hits.end);
mapper.init();
if (config.ext == Config::swipe)
mapper.align_targets(stat);
@@ -113,11 +85,13 @@ void align_worker(size_t thread_id)
for (size_t i = 0; i < mapper.n_targets(); ++i)
mapper.ungapped_stage(i);
if (config.ext != Config::most_greedy) {
+ mapper.fill_source_ranges();
mapper.rank_targets(config.rank_ratio == -1 ? (mapper.query_seq(0).length() > 50 ? 0.6 : 0.9) : config.rank_ratio);
stat.inc(Statistics::TARGET_HITS1, mapper.n_targets());
const int cutoff = int(mapper.raw_score_cutoff() * config.score_ratio);
for (size_t i = 0; i < mapper.n_targets(); ++i)
mapper.greedy_stage(i, stat, cutoff);
+ mapper.fill_source_ranges();
mapper.rank_targets(config.rank_ratio2 == -1 ? (mapper.query_seq(0).length() > 50 ? 0.95 : 1.0) : config.rank_ratio2);
stat.inc(Statistics::TARGET_HITS2, mapper.n_targets());
for (size_t i = 0; i < mapper.n_targets(); ++i)
@@ -129,157 +103,18 @@ void align_worker(size_t thread_id)
buf = new Text_buffer;
const bool aligned = mapper.generate_output(*buf, stat);
if (aligned && !config.unaligned.empty())
- query_aligned[query] = true;
- }
- Output_sink::get().push(query, buf);
- }
- statistics += stat;
-}
-
-void heartbeat_worker()
-{
- static const int interval = 100;
- int n = 0;
- while (Output_sink::get().next() < Simple_query_queue::get().qend()) {
- if (n == interval) {
- const string title(query_ids::get()[Output_sink::get().next()].c_str());
- verbose_stream << "Queries=" << Simple_query_queue::get().next() << " size=" << megabytes(Output_sink::get().size()) << " max_size=" << megabytes(Output_sink::get().max_size())
- << " next=" << title.substr(0, title.find(' ')) << endl;
- n = 0;
- }
- else
- ++n;
- tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10));
- }
-}
-
-Query_queue query_queue;
-
-void Query_queue::init(Trace_pt_list::iterator begin, Trace_pt_list::iterator end)
-{
- trace_pt_pos = begin;
- trace_pt_end = end;
- assert(queue.empty());
- assert(out_queue.empty());
- n = 0;
-}
-
-void Query_queue::flush(Output_stream *out, Statistics &stat)
-{
- writing = true;
- std::queue<Query_data*> q;
- while (true) {
- while (!out_queue.empty() && out_queue.front()->state == Query_data::finished) {
- q.push(out_queue.front());
- out_queue.pop();
- }
- if (q.empty()) {
- writing = false;
- lock.unlock();
- return;
- }
- lock.unlock();
-
- unsigned k = 0;
- while (!q.empty()) {
- out->write(q.front()->buf.get_begin(), q.front()->buf.size());
- delete q.front();
- q.pop();
- ++k;
+ query_aligned[hits.query] = true;
}
-
- lock.lock();
- n -= k;
- /*if (n > 100)
- cout << "qlen=" << out_queue.size() << " finished=" << n << endl;*/
+ Output_sink::get().push(hits.query, buf);
}
-}
-
-Query_data* Query_queue::get()
-{
- for (std::deque<Query_data*>::iterator i = queue.begin(); i != queue.end(); ++i)
- if ((*i)->state == Query_data::free)
- return *i;
- return 0;
-}
-
-void Query_queue::pop_busy()
-{
- while (!queue.empty() && (queue.front()->state == Query_data::closing || queue.front()->state == Query_data::finished)) {
- out_queue.push(queue.front());
- queue.pop_front();
- }
-}
-
-void align_worker(Output_stream *out)
-{
- Statistics stat;
- Query_data *data = 0;
- unsigned n_targets = 0;
-
- while (true) {
-
- query_queue.lock.lock();
-
- if (data) {
- data->mapper->targets_finished += n_targets;
- if (data->mapper->finished()) {
- query_queue.lock.unlock();
- const bool aligned = data->mapper->generate_output(data->buf, stat);
- const unsigned query_id = data->mapper->query_id;
- delete data->mapper;
- query_queue.lock.lock();
- data->state = Query_data::finished;
- if (aligned && !config.unaligned.empty())
- query_aligned[query_id] = true;
- ++query_queue.n;
- if (!query_queue.writing && !query_queue.out_queue.empty() && data == query_queue.out_queue.front()) {
- query_queue.flush(out, stat);
- data = 0;
- continue;
- }
- }
- }
-
- if (!(data = query_queue.get())) {
- if (query_queue.trace_pt_pos >= query_queue.trace_pt_end) {
- query_queue.lock.unlock();
- break;
- }
- else {
- query_queue.queue.push_back(new Query_data(new Query_mapper()));
- data = query_queue.queue.back();
- query_queue.lock.unlock();
- data->mapper->init();
- data->state = Query_data::free;
- data = 0;
- }
- }
- else {
- size_t target = data->mapper->next_target;
- n_targets = std::min(config.target_fetch_size, (unsigned)data->mapper->n_targets() - data->mapper->next_target);
- data->mapper->next_target += n_targets;
- if (target + n_targets == data->mapper->n_targets()) {
- data->state = Query_data::closing;
- query_queue.pop_busy();
- }
- query_queue.lock.unlock();
-
- for (unsigned i = 0; i < n_targets; ++i)
- data->mapper->align_target(target + i, stat);
- }
-
- }
-
statistics += stat;
}
void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file)
{
- query_queue.last_query = (unsigned)-1;
const size_t max_size = (size_t)std::min(config.chunk_size*1e9 * 9 * 2 / config.lowmem, 2e9);
pair<size_t, size_t> query_range;
- while(true) {
+ while (true) {
task_timer timer("Loading trace points", 3);
Trace_pt_list *v = new Trace_pt_list;
statistics.max(Statistics::TEMP_SPACE, trace_pts.load(*v, max_size, query_range));
@@ -291,32 +126,15 @@ void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file)
merge_sort(v->begin(), v->end(), config.threads_);
v->init();
timer.go("Computing alignments");
- if (config.load_balancing == Config::target_parallel) {
- query_queue.init(v->begin(), v->end());
- Thread_pool threads;
- for (unsigned i = 0; i < config.threads_; ++i)
- threads.push_back(launch_thread(static_cast<void (*)(Output_stream*)>(&align_worker), output_file));
- threads.join_all();
- }
- else {
- Simple_query_queue::instance = auto_ptr<Simple_query_queue>(new Simple_query_queue(query_range.first, query_range.second, v->begin(), v->end()));
- Output_sink::instance = auto_ptr<Output_sink>(new Output_sink(query_range.first, output_file));
- Thread_pool threads;
- if (config.verbosity >= 3)
- threads.push_back(launch_thread(heartbeat_worker));
- for (size_t i = 0; i < (config.threads_align == 0 ? config.threads_ : config.threads_align); ++i)
- threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), i));
- threads.join_all();
- }
+ Align_fetcher::init(query_range.first, query_range.second, v->begin(), v->end());
+ Output_sink::instance = auto_ptr<Output_sink>(new Output_sink(query_range.first, output_file));
+ Thread_pool threads;
+ if (config.verbosity >= 3)
+ threads.push_back(launch_thread(heartbeat_worker, query_range.second));
+ for (size_t i = 0; i < (config.threads_align == 0 ? config.threads_ : config.threads_align); ++i)
+ threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), i));
+ threads.join_all();
timer.go("Deallocating buffers");
delete v;
}
- if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0 && config.load_balancing == Config::target_parallel) {
- Text_buffer buf;
- for (unsigned i = query_queue.last_query + 1; i < query_ids::get().get_length(); ++i) {
- output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), buf, true);
- output_format->print_query_epilog(buf, true);
- }
- output_file->write(buf.get_begin(), buf.size());
- }
}
\ No newline at end of file
diff --git a/src/align/align.h b/src/align/align.h
index 33588b7..07d4b27 100644
--- a/src/align/align.h
+++ b/src/align/align.h
@@ -64,68 +64,6 @@ private:
Task_queue<_buffer, Output_writer> queue;
};
-struct Simple_query_queue
-{
- enum { end = 0xffffffffffffffffllu };
- Simple_query_queue(size_t qbegin, size_t qend, vector<hit>::iterator begin, vector<hit>::iterator end):
- next_(qbegin),
- qend_(qend),
- it_(begin),
- end_(end)
- {}
- size_t get(vector<hit>::iterator &begin, vector<hit>::iterator &end);
- size_t next() const
- {
- return next_;
- }
- size_t qend() const
- {
- return qend_;
- }
- static Simple_query_queue& get()
- {
- return *instance;
- }
- static auto_ptr<Simple_query_queue> instance;
-private:
- tthread::mutex mtx_;
- size_t next_;
- const size_t qend_;
- vector<hit>::iterator it_, end_;
-};
-
-struct Output_sink
-{
- Output_sink(size_t begin, Output_stream *f):
- f_(f),
- next_(begin),
- size_(0),
- max_size_(0)
- {}
- void push(size_t n, Text_buffer *buf);
- size_t size() const
- {
- return size_;
- }
- size_t max_size() const
- {
- return max_size_;
- }
- static Output_sink& get()
- {
- return *instance;
- }
- size_t next() const
- {
- return next_;
- }
- static auto_ptr<Output_sink> instance;
-private:
- void flush(Text_buffer *buf);
- tthread::mutex mtx_;
- Output_stream* const f_;
- std::map<size_t, Text_buffer*> backlog_;
- size_t next_, size_, max_size_;
-};
+void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file);
#endif
\ No newline at end of file
diff --git a/src/align/align_queries.h b/src/align/align_queries.h
deleted file mode 100644
index 8409f46..0000000
--- a/src/align/align_queries.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/****
-DIAMOND protein aligner
-Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as
-published by the Free Software Foundation, either version 3 of the
-License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-****/
-
-#ifndef ALIGN_QUERIES_H_
-#define ALIGN_QUERIES_H_
-
-#include <deque>
-#include "../util/merge_sort.h"
-#include "../search/trace_pt_buffer.h"
-#include "../util/map.h"
-#include "../util/task_queue.h"
-#include "align.h"
-#include "query_mapper.h"
-
-using std::vector;
-
-struct Query_data
-{
- Query_data():
- mapper(0)
- {}
- Query_data(Query_mapper *mapper):
- mapper(mapper),
- state(init)
- {}
- Query_mapper *mapper;
- Text_buffer buf;
- enum { init, free, closing, finished };
- unsigned state;
-};
-
-struct Query_queue
-{
- void init(Trace_pt_list::iterator begin, Trace_pt_list::iterator end);
- void flush(Output_stream *out, Statistics &stat);
- Query_data* get();
- void pop_busy();
-
- tthread::mutex lock;
- std::deque<Query_data*> queue;
- std::queue<Query_data*> out_queue;
- Trace_pt_list::iterator trace_pt_pos, trace_pt_end;
- bool writing;
- unsigned n, last_query;
-};
-
-extern Query_queue query_queue;
-
-void align_worker(Output_stream *out);
-void align_queries(Trace_pt_buffer &trace_pts, Output_stream* output_file);
-
-#endif /* ALIGN_QUERIES_H_ */
diff --git a/src/align/align_target.cpp b/src/align/align_target.cpp
index 43af301..e027ac6 100644
--- a/src/align/align_target.cpp
+++ b/src/align/align_target.cpp
@@ -272,6 +272,10 @@ void Query_mapper::align_target(size_t idx, Statistics &stat)
target.hsps.sort();
if(target.hsps.size() > 0)
target.filter_score = target.hsps.front().score;
+
+ target.ts.clear();
+ for (list<Hsp_data>::iterator i = target.hsps.begin(); i != target.hsps.end(); ++i)
+ target.ts.push_back(Hsp_traits(i->query_source_range));
if (config.use_smith_waterman && !target.hsps.empty()) {
int score;
diff --git a/src/align/query_mapper.cpp b/src/align/query_mapper.cpp
index 401a52e..12cb96e 100644
--- a/src/align/query_mapper.cpp
+++ b/src/align/query_mapper.cpp
@@ -16,24 +16,36 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
-#include "align_queries.h"
#include "query_mapper.h"
#include "../data/reference.h"
#include "extend_ungapped.h"
#include "../output/output.h"
#include "../output/output_format.h"
#include "../output/daa_write.h"
+#include "../output/target_culling.h"
-Query_mapper::Query_mapper() :
- source_hits(get_query_data()),
- query_id(source_hits.first->query_ / align_mode.query_contexts),
- targets_finished(0),
- next_target(0),
- source_query_len(get_source_query_len(query_id)),
- unaligned_from(query_queue.last_query+1)
-{
- query_queue.last_query = query_id;
- seed_hits.reserve(source_hits.second - source_hits.first);
+bool Target::envelopes(const Hsp_traits &t, double p) const
+{
+ for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); ++i)
+ if (t.query_source_range.overlap_factor(i->query_source_range) >= p)
+ return true;
+ return false;
+}
+
+bool Target::is_enveloped(const Target &t, double p) const
+{
+ for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); ++i)
+ if (!t.envelopes(*i, p))
+ return false;
+ return true;
+}
+
+bool Target::is_enveloped(Ptr_vector<Target>::const_iterator begin, Ptr_vector<Target>::const_iterator end, double p, int min_score) const
+{
+ for (; begin != end; ++begin)
+ if (is_enveloped(**begin, p) && (*begin)->filter_score >= min_score)
+ return true;
+ return false;
}
int Query_mapper::raw_score_cutoff() const
@@ -70,18 +82,6 @@ void Query_mapper::init()
rank_targets(config.rank_ratio == -1 ? 0.6 : config.rank_ratio);
}
-pair<Trace_pt_list::iterator, Trace_pt_list::iterator> Query_mapper::get_query_data()
-{
- const Trace_pt_list::iterator begin = query_queue.trace_pt_pos;
- if (begin == query_queue.trace_pt_end)
- return pair<Trace_pt_list::iterator, Trace_pt_list::iterator>(begin, begin);
- const unsigned c = align_mode.query_contexts, query = begin->query_ / c;
- Trace_pt_list::iterator end = begin;
- for (; end < query_queue.trace_pt_end && end->query_ / c == query; ++end);
- query_queue.trace_pt_pos = end;
- return pair<Trace_pt_list::iterator, Trace_pt_list::iterator>(begin, end);
-}
-
unsigned Query_mapper::count_targets()
{
std::sort(source_hits.first, source_hits.second, hit::cmp_subject);
@@ -132,49 +132,64 @@ void Query_mapper::rank_targets(double ratio)
{
std::sort(targets.begin(), targets.end(), Target::compare);
- int score = 0;
- if (config.toppercent < 100) {
- score = int((double)targets[0].filter_score * (1.0 - config.toppercent / 100.0) * ratio);
+ if (config.query_overlap_culling == 0.0) {
+
+ int score = 0;
+ if (config.toppercent < 100) {
+ score = int((double)targets[0].filter_score * (1.0 - config.toppercent / 100.0) * ratio);
+ }
+ else {
+ size_t min_idx = std::min(targets.size(), (size_t)config.max_alignments);
+ score = int((double)targets[min_idx - 1].filter_score * ratio);
+ }
+
+ unsigned i = 0;
+ for (; i < targets.size(); ++i)
+ if (targets[i].filter_score < score)
+ break;
+
+ if (config.benchmark_ranking)
+ for (unsigned j = i; j < targets.size(); ++j)
+ targets[j].outranked = true;
+ else
+ targets.erase(targets.begin() + i, targets.end());
+
}
else {
- size_t min_idx = std::min(targets.size(), (size_t)config.max_alignments);
- score = int((double)targets[min_idx - 1].filter_score * ratio);
- }
- unsigned i = 0;
- for (; i < targets.size(); ++i)
- if (targets[i].filter_score < score)
- break;
-
- if (config.benchmark_ranking)
- for (unsigned j = i; j < targets.size(); ++j)
- targets[j].outranked = true;
- else
- targets.erase(targets.begin() + i, targets.end());
+ int i = 0;
+ const double p = config.query_overlap_culling / 100.0;
+ while (i < (int)targets.size()) {
+ int min_score = int((double)targets[i].filter_score / (1.0 - config.toppercent / 100.0) / ratio);
+ if (targets[i].is_enveloped(targets.begin(), targets.begin() + i, p, min_score))
+ targets.erase(targets.begin() + i, targets.begin() + i + 1);
+ else
+ ++i;
+ }
+
+ }
}
bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
{
- if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0 && config.load_balancing == Config::target_parallel) {
- for (unsigned i = unaligned_from; i < query_id; ++i) {
- output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), buffer, true);
- output_format->print_query_epilog(buffer, true);
- }
- }
-
std::sort(targets.begin(), targets.end(), Target::compare);
unsigned n_hsp = 0, n_target_seq = 0, hit_hsps = 0;
- const unsigned top_score = targets.empty() ? 0 : targets[0].filter_score, query_len = (unsigned)query_seq(0).length();
+ auto_ptr<Target_culling> target_culling(Target_culling::get(targets.empty() ? 0 : &targets[0]));
+ const unsigned query_len = (unsigned)query_seq(0).length();
size_t seek_pos = 0;
const char *query_title = query_ids::get()[query_id].c_str();
+ auto_ptr<Output_format> f(output_format->clone());
for (size_t i = 0; i < targets.size(); ++i) {
if ((config.min_bit_score == 0 && score_matrix.evalue(targets[i].filter_score, config.db_size, query_len) > config.max_evalue)
|| score_matrix.bitscore(targets[i].filter_score) < config.min_bit_score)
break;
- if (!config.output_range(n_target_seq, targets[i].filter_score, top_score))
+ const int c = target_culling->cull(targets[i]);
+ if (c == Target_culling::next)
+ continue;
+ else if (c == Target_culling::finished)
break;
if (targets[i].outranked)
@@ -202,15 +217,15 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
}
else {
if (n_hsp == 0) {
- if (*output_format == Output_format::daa)
+ if (*f == Output_format::daa)
seek_pos = write_daa_query_record(buffer, query_title, align_mode.query_translated ? query_source_seqs::get()[query_id] : query_seqs::get()[query_id]);
else
- output_format->print_query_intro(query_id, query_title, source_query_len, buffer, false);
+ f->print_query_intro(query_id, query_title, source_query_len, buffer, false);
}
- if (*output_format == Output_format::daa)
+ if (*f == Output_format::daa)
write_daa_record(buffer, *j, query_id, targets[i].subject_id);
else
- output_format->print_match(Hsp_context(*j,
+ f->print_match(Hsp_context(*j,
query_id,
query_seq(j->frame),
query_source_seq(),
@@ -223,8 +238,10 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
hit_hsps), buffer);
}
- if(hit_hsps == 0)
+ if (hit_hsps == 0) {
++n_target_seq;
+ target_culling->add(&targets[i]);
+ }
++n_hsp;
++hit_hsps;
if (config.alignment_traceback && j->gap_openings > 0)
@@ -235,17 +252,17 @@ bool Query_mapper::generate_output(Text_buffer &buffer, Statistics &stat)
if (n_hsp > 0) {
if (!blocked_processing) {
- if (*output_format == Output_format::daa)
+ if (*f == Output_format::daa)
finish_daa_query_record(buffer, seek_pos);
else
- output_format->print_query_epilog(buffer, false);
+ f->print_query_epilog(buffer, query_title, false);
}
else
Intermediate_record::finish_query(buffer, seek_pos);
}
- else if (!blocked_processing && *output_format != Output_format::daa && config.report_unaligned != 0) {
- output_format->print_query_intro(query_id, query_title, source_query_len, buffer, true);
- output_format->print_query_epilog(buffer, true);
+ else if (!blocked_processing && *f != Output_format::daa && config.report_unaligned != 0) {
+ f->print_query_intro(query_id, query_title, source_query_len, buffer, true);
+ f->print_query_epilog(buffer, query_title, true);
}
if (!blocked_processing) {
diff --git a/src/align/query_mapper.h b/src/align/query_mapper.h
index 5da19c3..bbcd8b8 100644
--- a/src/align/query_mapper.h
+++ b/src/align/query_mapper.h
@@ -34,6 +34,9 @@ using std::list;
struct Target
{
+ Target(int score):
+ filter_score(score)
+ {}
Target(size_t begin, unsigned subject_id) :
subject_id(subject_id),
filter_score(0),
@@ -44,6 +47,14 @@ struct Target
{
return lhs->filter_score > rhs->filter_score;
}
+ void fill_source_ranges(size_t query_len)
+ {
+ for (list<Hsp_traits>::iterator i = ts.begin(); i != ts.end(); ++i)
+ i->query_source_range = align_mode.query_translated ? untranslate_range(i->query_range, i->frame, query_len) : i->query_range;
+ }
+ bool envelopes(const Hsp_traits &t, double p) const;
+ bool is_enveloped(const Target &t, double p) const;
+ bool is_enveloped(Ptr_vector<Target>::const_iterator begin, Ptr_vector<Target>::const_iterator end, double p, int min_score) const;
unsigned subject_id;
int filter_score;
float filter_time;
@@ -78,6 +89,11 @@ struct Query_mapper
{
return query_seqs::get()[query_id*align_mode.query_contexts + frame];
}
+ void fill_source_ranges()
+ {
+ for (size_t i = 0; i < targets.size(); ++i)
+ targets[i].fill_source_ranges(source_query_len);
+ }
pair<Trace_pt_list::iterator, Trace_pt_list::iterator> source_hits;
unsigned query_id, targets_finished, next_target;
private:
diff --git a/src/basic/basic.cpp b/src/basic/basic.cpp
index aa6fab3..5455ea3 100644
--- a/src/basic/basic.cpp
+++ b/src/basic/basic.cpp
@@ -24,9 +24,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "sequence.h"
#include "masking.h"
-const char* Const::version_string = "0.9.8";
+const char* Const::version_string = "0.9.10";
const char* Const::program_name = "diamond";
-const char* Const::id_delimiters = " \a\b\f\n\r\t\v";
+const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
Value_traits::Value_traits(const char *alphabet, Letter mask_char, const char *ignore) :
alphabet(alphabet),
diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index bb3fe7a..edbace2 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -56,7 +56,8 @@ Config::Config(int argc, const char **argv)
.add_command("model-seqs", "")
.add_command("opt", "")
.add_command("mask", "")
- .add_command("fastq2fasta", "");
+ .add_command("fastq2fasta", "")
+ .add_command("dbinfo", "");
Options_group general("General options");
general.add()
@@ -111,10 +112,12 @@ Config::Config(int argc, const char **argv)
Options_group aligner("Aligner options");
aligner.add()
("query", 'q', "input query file", query_file)
+ ("strand", 0, "query strands to search (both/minus/plus)", query_strands, string("both"))
("un", 0, "file for unaligned queries", unaligned)
("unal", 0, "report unaligned queries (0=no, 1=yes)", report_unaligned, -1)
("max-target-seqs", 'k', "maximum number of target sequences to report alignments for", max_alignments, uint64_t(25))
("top", 0, "report alignments within this percentage range of top alignment score (overrides --max-target-seqs)", toppercent, 100.0)
+ ("overlap-culling", 0, "delete hits only if a higher scoring hit envelops the given percentage of the query range", query_overlap_culling)
("compress", 0, "compression for output files (0=none, 1=gzip)", compression)
("evalue", 'e', "maximum e-value to report alignments (default=0.001)", max_evalue, 0.001)
("min-score", 0, "minimum bit score to report alignments (overrides e-value setting)", min_bit_score)
@@ -137,8 +140,10 @@ Config::Config(int argc, const char **argv)
//("seg", 0, "enable SEG masking of queries (yes/no)", seg)
("query-gencode", 0, "genetic code to use to translate query (see user manual)", query_gencode, 1u)
("salltitles", 0, "include full subject titles in DAA file", salltitles)
+ ("sallseqid", 0, "include all subject ids in DAA file", sallseqid)
("no-self-hits", 0, "suppress reporting of identical self hits", no_self_hits)
- ("taxonmap", 0, "protein accession to taxid mapping file", prot_accession2taxid);
+ ("taxonmap", 0, "protein accession to taxid mapping file", prot_accession2taxid)
+ ("taxonnodes", 0, "taxonomy nodes.dmp from NCBI", nodesdmp);
Options_group advanced("Advanced options");
advanced.add()
@@ -230,7 +235,7 @@ Config::Config(int argc, const char **argv)
throw std::runtime_error("Invalid option: --block-size/-b. Block size is set for the alignment commands.");
break;
case Config::blastp:
- case Config::blastx:
+ case Config::blastx:
if (database == "")
throw std::runtime_error("Missing parameter: database file (--db/-d)");
if (daa_file.length() > 0) {
@@ -254,6 +259,12 @@ Config::Config(int argc, const char **argv)
;
}
+ switch (command) {
+ case Config::dbinfo:
+ if (database == "")
+ throw std::runtime_error("Missing parameter: database file (--db/-d)");
+ }
+
if (hit_cap != 0)
throw std::runtime_error("Deprecated parameter: --max-hits/-C.");
@@ -353,14 +364,6 @@ Config::Config(int argc, const char **argv)
#ifdef __POPCNT__
verbose_stream << "POPCNT enabled." << endl;
#endif
-
- message_stream << "#Target sequences to report alignments for: ";
- if (max_alignments == 0) {
- max_alignments = std::numeric_limits<uint64_t>::max();
- message_stream << "unlimited" << endl;
- }
- else
- message_stream << max_alignments << endl;
}
Translator::init(query_gencode);
@@ -371,6 +374,9 @@ Config::Config(int argc, const char **argv)
if (command == help)
parser.print_help();
+ if (query_strands != "both" && query_strands != "minus" && query_strands != "plus")
+ throw std::runtime_error("Invalid value for parameter --strand");
+
/*log_stream << "sizeof(hit)=" << sizeof(hit) << " sizeof(packed_uint40_t)=" << sizeof(packed_uint40_t)
<< " sizeof(sorted_list::entry)=" << sizeof(sorted_list::entry) << endl;*/
}
diff --git a/src/basic/config.h b/src/basic/config.h
index 0b1035b..a348d46 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -139,10 +139,14 @@ struct Config
double score_ratio;
bool small_query;
bool hashed_seeds;
+ string nodesdmp;
+ bool sallseqid;
+ double query_overlap_culling;
+ string query_strands;
enum {
makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
- match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18
+ match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18, dbinfo=19
};
unsigned command;
diff --git a/src/basic/const.h b/src/basic/const.h
index 0dd7c7a..a2107ba 100644
--- a/src/basic/const.h
+++ b/src/basic/const.h
@@ -23,7 +23,7 @@ struct Const
{
enum {
- build_version = 109,
+ build_version = 111,
daa_version = 0,
seedp_bits = 10,
seedp = 1<<seedp_bits,
diff --git a/src/basic/hssp.cpp b/src/basic/hssp.cpp
index d0134d9..579523e 100644
--- a/src/basic/hssp.cpp
+++ b/src/basic/hssp.cpp
@@ -18,6 +18,21 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../align/align.h"
+interval untranslate_range(const interval &r, unsigned frame, size_t len)
+{
+ signed f = frame <= 2 ? frame + 1 : 2 - frame;
+ interval s;
+ if (f > 0) {
+ s.begin_ = (f - 1) + 3 * r.begin_;
+ s.end_ = s.begin_ + 3 * r.length();
+ }
+ else {
+ s.end_ = (int)len + f - 3 * r.begin_ + 1;
+ s.begin_ = s.end_ - 3 * r.length();
+ }
+ return s;
+}
+
bool Hsp_data::pass_through(const Diagonal_segment &d) const
{
if (intersect(d.query_range(), query_range).length() != d.len
@@ -80,19 +95,8 @@ void Hsp_data::set_source_range(unsigned frame, unsigned dna_len)
this->frame = frame;
if (!align_mode.query_translated)
query_source_range = query_range;
- else {
- signed f = frame <= 2 ? frame + 1 : 2 - frame;
- if (f > 0) {
- query_source_range.begin_ = (f - 1) + 3 * query_range.begin_;
- query_source_range.end_ = query_source_range.begin_ + 3 * query_range.length();
- //query_end_dna = (f-1) + 3 * (l.query_begin_+l.query_len_-1) + 3;
- }
- else {
- query_source_range.end_ = dna_len + f - 3 * query_range.begin_ + 1;
- //query_end_dna = dna_len + (f + 1) - 3 * (l.query_begin_+l.query_len_-1) - 2;
- query_source_range.begin_ = query_source_range.end_ - 3 * query_range.length();
- }
- }
+ else
+ query_source_range = untranslate_range(query_range, frame, dna_len);
}
Hsp_context& Hsp_context::parse()
diff --git a/src/basic/match.h b/src/basic/match.h
index 555ecb6..991d40d 100644
--- a/src/basic/match.h
+++ b/src/basic/match.h
@@ -38,6 +38,8 @@ inline interval normalized_range(unsigned pos, int len, Strand strand)
: interval (pos + 1 + len, pos + 1);
}
+interval untranslate_range(const interval &r, unsigned frame, size_t l);
+
struct Diagonal_segment
{
Diagonal_segment():
diff --git a/src/basic/sequence.h b/src/basic/sequence.h
index 349a9de..b52f3fb 100644
--- a/src/basic/sequence.h
+++ b/src/basic/sequence.h
@@ -100,8 +100,13 @@ struct sequence
}
std::ostream& print(std::ostream &os, const Value_traits &v) const
{
- for (unsigned i = 0; i<len_; ++i)
- os << v.alphabet[(long)data_[i]];
+ for (unsigned i = 0; i < len_; ++i) {
+ long l = (long)data_[i];
+ if ((l & 128) == 0)
+ os << v.alphabet[l];
+ else
+ os << (char)tolower(v.alphabet[l & 127]);
+ }
return os;
}
std::ostream& print(std::ostream &os, const Value_traits &v, Reversed) const
diff --git a/src/basic/value.h b/src/basic/value.h
index 5f7a256..58467dc 100644
--- a/src/basic/value.h
+++ b/src/basic/value.h
@@ -76,7 +76,7 @@ private:
struct Value_traits
{
- Value_traits(const char *alphabet, Letter mask_char, const char *ignore);
+ Value_traits(const char *alphabet, Letter mask_char, const char *ignore);
const char *alphabet;
unsigned alphabet_size;
Letter mask_char;
diff --git a/src/data/load_seqs.h b/src/data/load_seqs.h
index a203779..fd67db3 100644
--- a/src/data/load_seqs.h
+++ b/src/data/load_seqs.h
@@ -24,7 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../basic/translate.h"
#include "../util/seq_file_format.h"
-inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vector<Letter> &seq)
+inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vector<Letter> &seq, unsigned frame_mask)
{
if (config.command == Config::blastp || config.command == Config::makedb || config.command == Config::random_seqs) {
ss.push_back(seq);
@@ -42,7 +42,7 @@ inline size_t push_seq(Sequence_set &ss, Sequence_set** source_seqs, const vecto
unsigned bestFrames(Translator::computeGoodFrames(proteins, config.get_run_len((unsigned)seq.size() / 3)));
for (unsigned j = 0; j < 6; ++j) {
- if (bestFrames & (1 << j))
+ if ((bestFrames & (1 << j)) && (frame_mask & (1 << j)))
ss.push_back(proteins[j]);
else
ss.fill(proteins[j].size(), value_traits.mask_char);
@@ -68,10 +68,16 @@ inline size_t load_seqs(Input_stream &file,
vector<char> id;
string id2;
+ unsigned frame_mask = (1 << 6) - 1;
+ if (config.query_strands == "plus")
+ frame_mask = (1 << 3) - 1;
+ else if (config.query_strands == "minus")
+ frame_mask = ((1 << 3) - 1) << 3;
+
while (letters < max_letters && format.get_seq(id, seq, file)) {
if (seq.size() > 0 && (filter.empty() || id2.assign(id.data(), id.data() + id.size()).find(filter, 0) != string::npos)) {
ids->push_back(id);
- letters += push_seq(**seqs, source_seqs, seq);
+ letters += push_seq(**seqs, source_seqs, seq, frame_mask);
++n;
if ((*seqs)->get_length() >(size_t)std::numeric_limits<int>::max())
throw std::runtime_error("Number of sequences in file exceeds supported maximum.");
diff --git a/src/data/reference.cpp b/src/data/reference.cpp
index c86caef..4e15766 100644
--- a/src/data/reference.cpp
+++ b/src/data/reference.cpp
@@ -39,6 +39,18 @@ bool blocked_processing;
using std::cout;
using std::endl;
+string* get_allseqids(const char *s)
+{
+ string *r = new string;
+ const vector<string> t(tokenize(s, "\1"));
+ for (vector<string>::const_iterator i = t.begin(); i != t.end(); ++i) {
+ if (i != t.begin())
+ r->append("\1");
+ r->append(i->substr(0, find_first_of(i->c_str(), Const::id_delimiters)));
+ }
+ return r;
+}
+
struct Pos_record
{
Pos_record()
@@ -211,4 +223,14 @@ void Database_file::get_seq()
seq.clear();
id.clear();
}
+}
+
+void db_info()
+{
+ Database_file db_file;
+ cout << "Database format version = " << ref_header.db_version << endl;
+ cout << "Diamond build = " << ref_header.build << endl;
+ cout << "Sequences = " << ref_header.sequences << endl;
+ cout << "Letters = " << ref_header.letters << endl;
+ db_file.close();
}
\ No newline at end of file
diff --git a/src/data/reference.h b/src/data/reference.h
index 7eb8bb5..c74f5e9 100644
--- a/src/data/reference.h
+++ b/src/data/reference.h
@@ -129,6 +129,8 @@ inline size_t max_id_len(const String_set<0> &ids)
return max;
}
+string* get_allseqids(const char *s);
+
struct Ref_map
{
Ref_map():
@@ -157,10 +159,13 @@ struct Ref_map
n = next_++;
data_[block][i] = n;
len_.push_back((uint32_t)ref_seqs::get().length(i));
+ const char *title = ref_ids::get()[i].c_str();
if (config.salltitles)
- name_.push_back(new string(ref_ids::get()[i].c_str()));
+ name_.push_back(new string(title));
+ else if (config.sallseqid)
+ name_.push_back(get_allseqids(title));
else
- name_.push_back(get_str(ref_ids::get()[i].c_str(), Const::id_delimiters));
+ name_.push_back(get_str(title, Const::id_delimiters));
mtx_.unlock();
}
return n;
diff --git a/src/data/taxonomy.cpp b/src/data/taxonomy.cpp
index 838f29f..9172a56 100644
--- a/src/data/taxonomy.cpp
+++ b/src/data/taxonomy.cpp
@@ -17,10 +17,16 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <stdio.h>
+#include <set>
#include "taxonomy.h"
#include "../util/compressed_stream.h"
#include "../basic/config.h"
#include "../util/merge_sort.h"
+#include "../util/log_stream.h"
+
+using std::cout;
+using std::endl;
+using std::set;
Taxonomy taxonomy;
@@ -28,7 +34,7 @@ string& get_accession(string &t)
{
size_t i;
if (t.compare(0, 6, "UniRef") == 0)
- t.erase(0, 9);
+ t.erase(0, t.find('_', 0) + 1);
else if ((i = t.find_first_of('|', 0)) != string::npos) {
if (t.compare(0, 3, "gi|") == 0) {
t.erase(0, t.find_first_of('|', i + 1) + 1);
@@ -62,5 +68,78 @@ void Taxonomy::load()
/*if (f.line_count % 10000 == 0)
std::cout << f.line_count << endl;*/
}
+ f.close();
merge_sort(accession2taxid_.begin(), accession2taxid_.end(), config.threads_);
+}
+
+void Taxonomy::load_nodes()
+{
+ Input_stream f(config.nodesdmp);
+ unsigned taxid, parent;
+ while (!f.eof() && (f.getline(), !f.line.empty())) {
+ if (sscanf(f.line.c_str(), "%u\t|\t%u", &taxid, &parent) != 2)
+ throw std::runtime_error("Invalid nodes.dmp file format.");
+ //cout << taxid << '\t' << parent << endl;
+ parent_.resize(taxid + 1);
+ parent_[taxid] = parent;
+ }
+ f.close();
+}
+
+void Taxonomy::init()
+{
+ task_timer timer;
+ if (!config.prot_accession2taxid.empty()) {
+ timer.go("Loading taxonomy");
+ load();
+ timer.finish();
+ }
+
+ if (!config.nodesdmp.empty()) {
+ timer.go("Loading taxonomy nodes");
+ load_nodes();
+ timer.finish();
+ }
+}
+
+void Taxonomy::get_taxids(const char *id, set<unsigned> &taxons) const
+{
+ const vector<string> t(tokenize(id, "\1"));
+ for (vector<string>::const_iterator i = t.begin(); i < t.end(); ++i) {
+ const unsigned id = get(Taxonomy::Accession(*i));
+ if(id != 0)
+ taxons.insert(id);
+ }
+}
+
+unsigned Taxonomy::get_lca(unsigned t1, unsigned t2) const
+{
+ static const int max = 64;
+ if (t1 == t2 || t2 == 0)
+ return t1;
+ if (t1 == 0)
+ return t2;
+ unsigned p = t2;
+ set<unsigned> l;
+ int n = 0;
+ do {
+ p = get_parent(p);
+ if (p == 0)
+ return t1;
+ l.insert(p);
+ if (++n > max)
+ throw std::runtime_error("Path in taxonomy too long (1).");
+ } while (p != t1 && p != 1);
+ if (p == t1)
+ return p;
+ p = t1;
+ n = 0;
+ while (l.find(p) == l.end()) {
+ p = get_parent(p);
+ if (p == 0)
+ return t2;
+ if (++n > max)
+ throw std::runtime_error("Path in taxonomy too long (2).");
+ }
+ return p;
}
\ No newline at end of file
diff --git a/src/data/taxonomy.h b/src/data/taxonomy.h
index 1ce6dc5..1a2537f 100644
--- a/src/data/taxonomy.h
+++ b/src/data/taxonomy.h
@@ -19,6 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <vector>
#include <algorithm>
#include <string>
+#include <set>
#include "../basic/const.h"
#include "../util/util.h"
@@ -71,7 +72,10 @@ struct Taxonomy
char s[max_accesion_len];
};
+ void init();
void load();
+ void load_nodes();
+ void get_taxids(const char *s, std::set<unsigned> &taxons) const;
unsigned get(const Accession &accession) const
{
@@ -82,9 +86,20 @@ struct Taxonomy
return 0;
}
+ unsigned get_parent(unsigned taxid) const
+ {
+ if (parent_.size() <= taxid)
+ throw std::runtime_error(string("No taxonomy node found for taxon id ") + to_string(taxid));
+ return parent_[taxid];
+ }
+
+ unsigned get_lca(unsigned t1, unsigned t2) const;
+
private:
std::vector<std::pair<Accession, unsigned> > accession2taxid_;
+ std::vector<unsigned> parent_;
+
};
extern Taxonomy taxonomy;
\ No newline at end of file
diff --git a/src/dp/dp.h b/src/dp/dp.h
index dbdec70..6a76900 100644
--- a/src/dp/dp.h
+++ b/src/dp/dp.h
@@ -94,6 +94,11 @@ struct Hsp_traits
score(0),
frame((int)frame)
{}
+ Hsp_traits(const interval &query_source_range):
+ query_source_range(query_source_range)
+ {}
+ Hsp_traits(const Hsp_data &hsp)
+ {}
int partial_score(const Diagonal_segment &d) const
{
const double overlap = std::max(d.subject_range().overlap_factor(subject_range), d.query_range().overlap_factor(query_range));
@@ -142,7 +147,7 @@ struct Hsp_traits
}
};
int d_min, d_max, score, frame;
- interval query_range, subject_range;
+ interval query_source_range, query_range, subject_range;
};
template<typename _score>
diff --git a/src/dp/swipe.cpp b/src/dp/swipe.cpp
index d09b2ee..9b3e070 100644
--- a/src/dp/swipe.cpp
+++ b/src/dp/swipe.cpp
@@ -246,4 +246,4 @@ void swipe(const sequence &query, vector<sequence>::const_iterator subject_begin
#ifdef __SSE2__
swipe<uint8_t>(query, subject_begin, subject_end, out);
#endif
-}
+}
\ No newline at end of file
diff --git a/src/extra/match_file.h b/src/extra/match_file.h
index cd2ef69..75252d7 100644
--- a/src/extra/match_file.h
+++ b/src/extra/match_file.h
@@ -21,6 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <stdio.h>
#include <vector>
+#include "../util/binary_file.h"
#include "blast_record.h"
#include "../util/util.h"
#include "../basic/reduction.h"
diff --git a/src/output/blast_pairwise_format.cpp b/src/output/blast_pairwise_format.cpp
index d4c4b20..1096e97 100644
--- a/src/output/blast_pairwise_format.cpp
+++ b/src/output/blast_pairwise_format.cpp
@@ -18,7 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "output_format.h"
-void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out) const
+void Pairwise_format::print_match(const Hsp_context& r, Text_buffer &out)
{
static const unsigned width = 60;
out << '>';
@@ -67,9 +67,8 @@ void Pairwise_format::print_footer(Output_stream &out) const
}
-void Pairwise_format::print_query_epilog(Text_buffer &out, bool unaligned) const
+void Pairwise_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const
{
-
}
void Pairwise_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const
diff --git a/src/output/blast_tab_format.cpp b/src/output/blast_tab_format.cpp
index a85147a..59fafd3 100644
--- a/src/output/blast_tab_format.cpp
+++ b/src/output/blast_tab_format.cpp
@@ -95,17 +95,17 @@ Blast_tab_format::Blast_tab_format() :
void print_staxids(Text_buffer &out, const char *id)
{
- const vector<string> t(tokenize(id, "\1"));
std::set<unsigned> taxons;
- for (vector<string>::const_iterator i = t.begin(); i < t.end(); ++i)
- taxons.insert(taxonomy.get(Taxonomy::Accession(*i)));
+ taxonomy.get_taxids(id, taxons);
+ if (taxons.empty())
+ return;
std::set<unsigned>::const_iterator i = taxons.begin();
out << *(i++);
for (; i != taxons.end(); ++i)
out << ';' << *i;
}
-void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out) const
+void Blast_tab_format::print_match(const Hsp_context& r, Text_buffer &out)
{
for (vector<unsigned>::const_iterator i = fields.begin(); i != fields.end(); ++i) {
switch (*i) {
diff --git a/src/output/join_blocks.cpp b/src/output/join_blocks.cpp
index 0b2e63e..353e06b 100644
--- a/src/output/join_blocks.cpp
+++ b/src/output/join_blocks.cpp
@@ -21,6 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../data/queries.h"
#include "../output/daa_write.h"
#include "output_format.h"
+#include "../align/query_mapper.h"
struct Join_fetcher
{
@@ -123,7 +124,35 @@ struct Join_record
};
-void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statistics, unsigned query, const char *query_name, unsigned query_source_len)
+struct Join_records
+{
+ Join_records(vector<Binary_buffer> &buf)
+ {
+ for (unsigned i = 0; i < current_ref_block; ++i) {
+ it.push_back(buf[i].begin());
+ Join_record::push_next(i, std::numeric_limits<unsigned>::max(), it.back(), records);
+ }
+ std::make_heap(records.begin(), records.end());
+ }
+ Target* get_target()
+ {
+ if (records.empty())
+ return 0;
+ const unsigned block = records.front().block_;
+ const unsigned subject = records.front().info_.subject_id;
+ Target *t = new Target(records.front().info_.score);
+ do {
+ std::pop_heap(records.begin(), records.end());
+ records.pop_back();
+ if (Join_record::push_next(block, subject, it[block], records))
+ std::push_heap(records.begin(), records.end());
+ } while (!records.empty() && records.front().info_.subject_id == subject);
+ }
+ vector<Join_record> records;
+ vector<Binary_buffer::Iterator> it;
+};
+
+void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statistics, unsigned query, const char *query_name, unsigned query_source_len, Output_format &f)
{
sequence context[6];
for (unsigned i = 0; i < align_mode.query_contexts; ++i)
@@ -148,11 +177,11 @@ void join_query(vector<Binary_buffer> &buf, Text_buffer &out, Statistics &statis
if (config.output_range(n_target_seq, next.info_.score, top_score) || same_subject) {
//printf("q=%u s=%u n=%u ss=%u\n",query, next.info_.subject_id, n_target_seq, same_subject, next.info_.score);
- if(*output_format == Output_format::daa)
+ if(f == Output_format::daa)
write_daa_record(out, next.info_);
else {
Hsp_data hsp(next.info_, query_source_len);
- output_format->print_match(Hsp_context(hsp,
+ f.print_match(Hsp_context(hsp,
query,
context[align_mode.check_context(hsp.frame)],
align_mode.query_translated ? query_source_seqs::get()[query] : context[0],
@@ -205,21 +234,23 @@ void join_worker(Task_queue<Text_buffer,Join_writer> *queue)
if (*output_format != Output_format::daa && config.report_unaligned != 0) {
for (unsigned i = fetcher.unaligned_from; i < fetcher.query_id; ++i) {
output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), *out, true);
- output_format->print_query_epilog(*out, true);
+ output_format->print_query_epilog(*out, query_ids::get()[i].c_str(), true);
}
}
- if (*output_format == Output_format::daa)
+ auto_ptr<Output_format> f(output_format->clone());
+
+ if (*f == Output_format::daa)
seek_pos = write_daa_query_record(*out, query_name, query_seq);
else
- output_format->print_query_intro(fetcher.query_id, query_name, (unsigned)query_seq.length(), *out, false);
+ f->print_query_intro(fetcher.query_id, query_name, (unsigned)query_seq.length(), *out, false);
- join_query(fetcher.buf, *out, stat, fetcher.query_id, query_name, (unsigned)query_seq.length());
+ join_query(fetcher.buf, *out, stat, fetcher.query_id, query_name, (unsigned)query_seq.length(), *f);
- if (*output_format == Output_format::daa)
+ if (*f == Output_format::daa)
finish_daa_query_record(*out, seek_pos);
else
- output_format->print_query_epilog(*out, false);
+ f->print_query_epilog(*out, query_name, false);
queue->push(n);
}
@@ -242,7 +273,7 @@ void join_blocks(unsigned ref_blocks, Output_stream &master_out, const vector<Te
Text_buffer out;
for (unsigned i = Join_fetcher::query_last + 1; i < query_ids::get().get_length(); ++i) {
output_format->print_query_intro(i, query_ids::get()[i].c_str(), get_source_query_len(i), out, true);
- output_format->print_query_epilog(out, true);
+ output_format->print_query_epilog(out, query_ids::get()[i].c_str(), true);
}
writer(out);
}
diff --git a/src/output/output.h b/src/output/output.h
index 03fe1fe..8bef7f5 100644
--- a/src/output/output.h
+++ b/src/output/output.h
@@ -19,6 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#ifndef OUTPUT_H_
#define OUTPUT_H_
+#include <map>
#include "../util/binary_file.h"
#include "../basic/packed_transcript.h"
#include "../util/text_buffer.h"
@@ -98,8 +99,42 @@ struct Intermediate_record
Packed_transcript transcript;
};
-#ifndef ST_JOIN
void join_blocks(unsigned ref_blocks, Output_stream &master_out, const vector<Temp_file> &tmp_file);
-#endif
+
+struct Output_sink
+{
+ Output_sink(size_t begin, Output_stream *f) :
+ f_(f),
+ next_(begin),
+ size_(0),
+ max_size_(0)
+ {}
+ void push(size_t n, Text_buffer *buf);
+ size_t size() const
+ {
+ return size_;
+ }
+ size_t max_size() const
+ {
+ return max_size_;
+ }
+ static Output_sink& get()
+ {
+ return *instance;
+ }
+ size_t next() const
+ {
+ return next_;
+ }
+ static auto_ptr<Output_sink> instance;
+private:
+ void flush(Text_buffer *buf);
+ tthread::mutex mtx_;
+ Output_stream* const f_;
+ std::map<size_t, Text_buffer*> backlog_;
+ size_t next_, size_, max_size_;
+};
+
+void heartbeat_worker(size_t qend);
#endif
\ No newline at end of file
diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp
index 74786f6..ef6113b 100644
--- a/src/output/output_format.cpp
+++ b/src/output/output_format.cpp
@@ -84,12 +84,31 @@ Output_format* get_output_format()
return new Pairwise_format;
else if (f[0] == "null")
return new Null_format;
+ else if (f[0] == "102")
+ return new Taxon_format;
else
- throw std::runtime_error("Invalid output format. Allowed values: 5,6,100,101");
+ throw std::runtime_error("Invalid output format. Allowed values: 0,5,6,100,101,102");
}
+void init_output()
+{
+ output_format = auto_ptr<Output_format>(get_output_format());
+ if (*output_format == Output_format::taxon && config.toppercent == 100.0)
+ config.toppercent = 10.0;
+ if (config.toppercent == 100.0) {
+ message_stream << "#Target sequences to report alignments for: ";
+ if (config.max_alignments == 0) {
+ config.max_alignments = std::numeric_limits<uint64_t>::max();
+ message_stream << "unlimited" << endl;
+ }
+ else
+ message_stream << config.max_alignments << endl;
+ }
+ else
+ message_stream << "Percentage range of top alignment score to report hits: " << config.toppercent << endl;
+}
-void XML_format::print_match(const Hsp_context &r, Text_buffer &out) const
+void XML_format::print_match(const Hsp_context &r, Text_buffer &out)
{
if(r.hsp_num == 0) {
if (r.hit_num > 0)
@@ -183,7 +202,7 @@ void XML_format::print_query_intro(size_t query_num, const char *query_name, uns
<< "<Iteration_hits>" << '\n';
}
-void XML_format::print_query_epilog(Text_buffer &out, bool unaligned) const
+void XML_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const
{
if (!unaligned) {
out << " </Hit_hsps>" << '\n'
diff --git a/src/output/output_format.h b/src/output/output_format.h
index 4ed34a5..15880de 100644
--- a/src/output/output_format.h
+++ b/src/output/output_format.h
@@ -34,14 +34,15 @@ struct Output_format
{}
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const
{}
- virtual void print_query_epilog(Text_buffer &out, bool unaligned) const
+ virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const
{}
- virtual void print_match(const Hsp_context& r, Text_buffer &out) const
+ virtual void print_match(const Hsp_context& r, Text_buffer &out)
{}
virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const
{ }
virtual void print_footer(Output_stream &f) const
{ }
+ virtual Output_format* clone() const = 0;
virtual ~Output_format()
{ }
static void print_title(Text_buffer &buf, const char *id, bool full_titles, bool all_titles, const char *separator);
@@ -50,7 +51,7 @@ struct Output_format
return code;
}
unsigned code;
- enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null };
+ enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null, taxon };
};
extern auto_ptr<Output_format> output_format;
@@ -60,6 +61,10 @@ struct Null_format : public Output_format
Null_format() :
Output_format(null)
{}
+ virtual Output_format* clone() const
+ {
+ return new Null_format(*this);
+ }
};
struct DAA_format : public Output_format
@@ -67,6 +72,10 @@ struct DAA_format : public Output_format
DAA_format():
Output_format(daa)
{}
+ virtual Output_format* clone() const
+ {
+ return new DAA_format(*this);
+ }
};
struct Blast_tab_format : public Output_format
@@ -74,9 +83,13 @@ struct Blast_tab_format : public Output_format
static const char* field_str[];
Blast_tab_format();
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
- virtual void print_match(const Hsp_context& r, Text_buffer &out) const;
+ virtual void print_match(const Hsp_context& r, Text_buffer &out);
virtual ~Blast_tab_format()
{ }
+ virtual Output_format* clone() const
+ {
+ return new Blast_tab_format(*this);
+ }
vector<unsigned> fields;
};
@@ -85,11 +98,15 @@ struct Sam_format : public Output_format
Sam_format():
Output_format(sam)
{ }
- virtual void print_match(const Hsp_context& r, Text_buffer &out) const;
+ virtual void print_match(const Hsp_context& r, Text_buffer &out);
virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const;
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
virtual ~Sam_format()
{ }
+ virtual Output_format* clone() const
+ {
+ return new Sam_format(*this);
+ }
};
struct XML_format : public Output_format
@@ -99,13 +116,17 @@ struct XML_format : public Output_format
{
config.salltitles = true;
}
- virtual void print_match(const Hsp_context &r, Text_buffer &out) const;
+ virtual void print_match(const Hsp_context &r, Text_buffer &out);
virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const;
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
- virtual void print_query_epilog(Text_buffer &out, bool unaligned) const;
+ virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const;
virtual void print_footer(Output_stream &f) const;
virtual ~XML_format()
{ }
+ virtual Output_format* clone() const
+ {
+ return new XML_format(*this);
+ }
};
struct Pairwise_format : public Output_format
@@ -115,16 +136,46 @@ struct Pairwise_format : public Output_format
{
config.salltitles = true;
}
- virtual void print_match(const Hsp_context &r, Text_buffer &out) const;
+ virtual void print_match(const Hsp_context &r, Text_buffer &out);
virtual void print_header(Output_stream &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const;
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, Text_buffer &out, bool unaligned) const;
- virtual void print_query_epilog(Text_buffer &out, bool unaligned) const;
+ virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const;
virtual void print_footer(Output_stream &f) const;
virtual ~Pairwise_format()
{ }
+ virtual Output_format* clone() const
+ {
+ return new Pairwise_format(*this);
+ }
+};
+
+struct Taxon_format : public Output_format
+{
+ Taxon_format() :
+ Output_format(taxon),
+ taxid(0),
+ evalue(std::numeric_limits<double>::max())
+ {
+ config.salltitles = true;
+ if (config.prot_accession2taxid.empty())
+ throw std::runtime_error("Output format requires setting the --taxonmap parameter.");
+ if (config.nodesdmp.empty())
+ throw std::runtime_error("Output format requires setting the --taxonnodes parameter.");
+ }
+ virtual void print_match(const Hsp_context &r, Text_buffer &out);
+ virtual void print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const;
+ virtual ~Taxon_format()
+ { }
+ virtual Output_format* clone() const
+ {
+ return new Taxon_format(*this);
+ }
+ unsigned taxid;
+ double evalue;
};
Output_format* get_output_format();
+void init_output();
void print_hsp(Hsp_data &hsp, sequence query);
#endif /* OUTPUT_FORMAT_H_ */
diff --git a/src/output/output_sink.cpp b/src/output/output_sink.cpp
new file mode 100644
index 0000000..d4e8f8b
--- /dev/null
+++ b/src/output/output_sink.cpp
@@ -0,0 +1,86 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "output.h"
+#include "../data/queries.h"
+
+using std::map;
+using std::auto_ptr;
+
+auto_ptr<Output_sink> Output_sink::instance;
+
+void Output_sink::push(size_t n, Text_buffer *buf)
+{
+ mtx_.lock();
+ //cout << "n=" << n << " next=" << next_ << endl;
+ if (n != next_) {
+ backlog_[n] = buf;
+ size_ += buf ? buf->alloc_size() : 0;
+ max_size_ = std::max(max_size_, size_);
+ mtx_.unlock();
+ }
+ else
+ flush(buf);
+}
+
+void Output_sink::flush(Text_buffer *buf)
+{
+ size_t n = next_ + 1;
+ vector<Text_buffer*> out;
+ out.push_back(buf);
+ map<size_t, Text_buffer*>::iterator i;
+ do {
+ while ((i = backlog_.begin()) != backlog_.end() && i->first == n) {
+ out.push_back(i->second);
+ backlog_.erase(i);
+ ++n;
+ }
+ mtx_.unlock();
+ size_t size = 0;
+ for (vector<Text_buffer*>::iterator j = out.begin(); j < out.end(); ++j) {
+ if (*j) {
+ f_->write((*j)->get_begin(), (*j)->size());
+ if (*j != buf)
+ size += (*j)->alloc_size();
+ delete *j;
+ }
+ }
+ out.clear();
+ mtx_.lock();
+ size_ -= size;
+ } while ((i = backlog_.begin()) != backlog_.end() && i->first == n);
+ next_ = n;
+ mtx_.unlock();
+}
+
+void heartbeat_worker(size_t qend)
+{
+ static const int interval = 100;
+ int n = 0;
+ while (Output_sink::get().next() < qend) {
+ if (n == interval) {
+ const string title(query_ids::get()[Output_sink::get().next()].c_str());
+ verbose_stream << "Queries=" << Output_sink::get().next() << " size=" << megabytes(Output_sink::get().size()) << " max_size=" << megabytes(Output_sink::get().max_size())
+ << " next=" << title.substr(0, title.find(' ')) << endl;
+ n = 0;
+ }
+ else
+ ++n;
+ tthread::this_thread::sleep_for(tthread::chrono::milliseconds(10));
+ }
+}
\ No newline at end of file
diff --git a/src/output/sam_format.cpp b/src/output/sam_format.cpp
index 932e406..d8a4469 100644
--- a/src/output/sam_format.cpp
+++ b/src/output/sam_format.cpp
@@ -82,7 +82,7 @@ void Sam_format::print_query_intro(size_t query_num, const char *query_name, uns
}
}
-void Sam_format::print_match(const Hsp_context& r, Text_buffer &out) const
+void Sam_format::print_match(const Hsp_context& r, Text_buffer &out)
{
out.write_until(r.query_name, Const::id_delimiters);
out << '\t' << '0' << '\t';
diff --git a/src/basic/const.h b/src/output/target_culling.cpp
similarity index 62%
copy from src/basic/const.h
copy to src/output/target_culling.cpp
index 0dd7c7a..52df2fe 100644
--- a/src/basic/const.h
+++ b/src/output/target_culling.cpp
@@ -16,31 +16,9 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
-#ifndef CONST_H_
-#define CONST_H_
+#include "target_culling.h"
-struct Const
+Target_culling* Target_culling::get(const Target* first)
{
-
- enum {
- build_version = 109,
- daa_version = 0,
- seedp_bits = 10,
- seedp = 1<<seedp_bits,
- max_seed_weight = 32,
- max_shapes = 16,
- max_shape_len = 32
- };
-
- static const char* version_string;
- static const char* program_name;
- static const char* id_delimiters;
-
-};
-
-#define SIMPLE_SEARCH
-// #define FREQUENCY_MASKING
-// #define ST_JOIN
-// #define NO_COLLISION_FILTER
-
-#endif /* CONST_H_ */
+ return config.query_overlap_culling == 0.0 ? new Target_culling(first) : new Overlap_culling;
+}
\ No newline at end of file
diff --git a/src/output/target_culling.h b/src/output/target_culling.h
new file mode 100644
index 0000000..365b548
--- /dev/null
+++ b/src/output/target_culling.h
@@ -0,0 +1,73 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#ifndef TARGET_CULLING_H_
+#define TARGET_CULLING_H_
+
+#include <vector>
+#include "../align/query_mapper.h"
+
+struct Target_culling
+{
+ Target_culling():
+ top_score_(0)
+ {}
+ Target_culling(const Target *first):
+ n_(0),
+ top_score_(first ? first->filter_score : 0)
+ {}
+ virtual int cull(const Target &t)
+ {
+ return config.output_range((unsigned)n_, t.filter_score, top_score_) ? use : finished;
+ }
+ virtual void add(Target *t)
+ {
+ ++n_;
+ }
+ enum { finished = 0, next = 1, use = 2};
+ static Target_culling* get(const Target* first);
+private:
+ size_t n_;
+ const int top_score_;
+};
+
+struct Overlap_culling : Target_culling
+{
+ Overlap_culling()
+ {}
+ virtual int cull(const Target &t)
+ {
+ return t.is_enveloped(targets_.begin(), targets_.end(), config.query_overlap_culling / 100.0, int(double(t.filter_score) / (1.0 - config.toppercent / 100.0)))
+ ? next : use;
+ }
+ virtual void add(Target *t)
+ {
+ targets_.push_back(t);
+ }
+ virtual void add(unsigned n, const Hsp_data &hsp)
+ {
+ if (n >= targets_.size()) {
+ targets_.push_back(new Target(hsp.score));
+ }
+ targets_[n]->ts.push_back(Hsp_traits(hsp));
+ }
+private:
+ std::vector<Target*> targets_;
+};
+
+#endif
\ No newline at end of file
diff --git a/src/output/taxon_format.cpp b/src/output/taxon_format.cpp
new file mode 100644
index 0000000..5979ae4
--- /dev/null
+++ b/src/output/taxon_format.cpp
@@ -0,0 +1,51 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2017 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <set>
+#include "output_format.h"
+#include "../data/taxonomy.h"
+
+using std::set;
+
+void Taxon_format::print_match(const Hsp_context &r, Text_buffer &out)
+{
+ set<unsigned> taxons;
+ taxonomy.get_taxids(r.subject_name, taxons);
+ if (taxons.empty())
+ return;
+ evalue = std::min(evalue, r.evalue());
+ try {
+ for (set<unsigned>::const_iterator i = taxons.begin(); i != taxons.end(); ++i)
+ taxid = taxonomy.get_lca(taxid, *i);
+ }
+ catch (std::exception &) {
+ std::cerr << "Query=" << r.query_name << endl << "Subject=" << r.subject_name << endl;
+ throw;
+ }
+}
+
+void Taxon_format::print_query_epilog(Text_buffer &out, const char *query_title, bool unaligned) const
+{
+ out.write_until(query_title, Const::id_delimiters);
+ out << '\t' << taxid << '\t';
+ if (taxid != 0)
+ out.print_e(evalue);
+ else
+ out << '0';
+ out << '\n';
+}
\ No newline at end of file
diff --git a/src/output/view.h b/src/output/view.cpp
similarity index 73%
rename from src/output/view.h
rename to src/output/view.cpp
index 21a2ba6..2a66b01 100644
--- a/src/output/view.h
+++ b/src/output/view.cpp
@@ -16,9 +16,6 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
-#ifndef VIEW_H_
-#define VIEW_H_
-
#include "../basic/config.h"
#include "../util/binary_file.h"
#include "../util/text_buffer.h"
@@ -28,13 +25,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/task_queue.h"
#include "../basic/score_matrix.h"
#include "../util/thread.h"
+#include "../data/taxonomy.h"
const unsigned view_buf_size = 32;
struct View_writer
{
- View_writer():
- f_ (config.compression == 1
+ View_writer() :
+ f_(config.compression == 1
? new Compressed_ostream(config.output_file)
: new Output_stream(config.output_file))
{ }
@@ -52,17 +50,18 @@ struct View_writer
struct View_fetcher
{
- View_fetcher(DAA_file &daa):
- daa (daa)
+ View_fetcher(DAA_file &daa) :
+ daa(daa)
{ }
bool operator()()
{
n = 0;
- for(unsigned i=0;i<view_buf_size;++i)
+ for (unsigned i = 0; i<view_buf_size; ++i)
if (!daa.read_query_buffer(buf[i], query_num)) {
query_num -= n - 1;
return false;
- } else
+ }
+ else
++n;
query_num -= n - 1;
return true;
@@ -73,63 +72,73 @@ struct View_fetcher
DAA_file &daa;
};
-void view_query(DAA_query_record &r, Text_buffer &out, const Output_format &format)
+void view_query(DAA_query_record &r, Text_buffer &out, Output_format &format)
{
- format.print_query_intro(r.query_num, r.query_name.c_str(), (unsigned)r.query_len(), out, false);
- for (DAA_query_record::Match_iterator i = r.begin(); i.good(); ++i) {
+ auto_ptr<Output_format> f(format.clone());
+ f->print_query_intro(r.query_num, r.query_name.c_str(), (unsigned)r.query_len(), out, false);
+ DAA_query_record::Match_iterator i = r.begin();
+ const unsigned top_score = i.good() ? i->score : 0;
+ for (; i.good(); ++i) {
if (i->frame > 2 && config.forwardonly)
continue;
- format.print_match(i->context(), out);
+ if (!config.output_range(i->hit_num, i->score, top_score))
+ break;
+ f->print_match(i->context(), out);
}
- format.print_query_epilog(out, false);
+ f->print_query_epilog(out, r.query_name.c_str(), false);
}
struct View_context
{
- View_context(DAA_file &daa, View_writer &writer, const Output_format &format):
- daa (daa),
- writer (writer),
- queue (3*config.threads_, writer),
- format (format)
+ View_context(DAA_file &daa, View_writer &writer, Output_format &format) :
+ daa(daa),
+ writer(writer),
+ queue(3 * config.threads_, writer),
+ format(format)
{ }
void operator()(unsigned thread_id)
{
try {
size_t n;
- View_fetcher query_buf (daa);
+ View_fetcher query_buf(daa);
Text_buffer *buffer = 0;
- while(queue.get(n, buffer, query_buf)) {
+ while (queue.get(n, buffer, query_buf)) {
for (unsigned j = 0; j < query_buf.n; ++j) {
DAA_query_record r(daa, query_buf.buf[j], query_buf.query_num + j);
view_query(r, *buffer, format);
}
queue.push(n);
}
- } catch(std::exception &e) {
+ }
+ catch (std::exception &e) {
std::cout << e.what() << std::endl;
std::terminate();
}
}
DAA_file &daa;
View_writer &writer;
- Task_queue<Text_buffer,View_writer> queue;
- const Output_format &format;
+ Task_queue<Text_buffer, View_writer> queue;
+ Output_format &format;
};
void view()
{
- DAA_file daa (config.daa_file);
+ task_timer timer("Loading subject IDs");
+ DAA_file daa(config.daa_file);
score_matrix = Score_matrix("", daa.lambda(), daa.kappa(), daa.gap_open_penalty(), daa.gap_extension_penalty());
+ timer.finish();
message_stream << "Scoring parameters: " << score_matrix << endl;
verbose_stream << "Build version = " << daa.diamond_build() << endl;
message_stream << "DB sequences = " << daa.db_seqs() << endl;
message_stream << "DB sequences used = " << daa.db_seqs_used() << endl;
message_stream << "DB letters = " << daa.db_letters() << endl;
-
- task_timer timer("Generating output");
+
+ init_output();
+ taxonomy.init();
+
+ timer.go("Generating output");
View_writer writer;
- output_format = auto_ptr<Output_format>(get_output_format());
Binary_buffer buf;
size_t query_num;
@@ -152,5 +161,3 @@ void view()
output_format->print_footer(*writer.f_);
}
-
-#endif /* VIEW_H_ */
diff --git a/src/run/double_indexed.cpp b/src/run/double_indexed.cpp
index a6fa614..ab2ede5 100644
--- a/src/run/double_indexed.cpp
+++ b/src/run/double_indexed.cpp
@@ -22,7 +22,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../data/queries.h"
#include "../basic/statistics.h"
#include "../basic/shape_config.h"
-#include "../align/align_queries.h"
#include "../search/align_range.h"
#include "../util/seq_file_format.h"
#include "../data/load_seqs.h"
@@ -320,7 +319,7 @@ void master_thread_di()
timer2.start();
align_mode = Align_mode(Align_mode::from_command(config.command));
- output_format = auto_ptr<Output_format>(get_output_format());
+ init_output();
message_stream << "Temporary directory: " << Temp_file::get_temp_dir() << endl;
@@ -343,12 +342,7 @@ void master_thread_di()
Config::set_option(config.db_size, (uint64_t)ref_header.letters);
set_max_open_files(config.query_bins * config.threads_ + unsigned(ref_header.letters / (size_t)(config.chunk_size * 1e9)) + 16);
-
- if (!config.prot_accession2taxid.empty()) {
- timer.go("Loading taxonomy");
- taxonomy.load();
- timer.finish();
- }
+ taxonomy.init();
master_thread(db_file, timer2);
}
\ No newline at end of file
diff --git a/src/run/main.cpp b/src/run/main.cpp
index 79b1fe5..db7e36d 100644
--- a/src/run/main.cpp
+++ b/src/run/main.cpp
@@ -18,7 +18,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <iostream>
#include "../basic/config.h"
-#include "../output/view.h"
#include "tools.h"
#include "../extra/compare.h"
@@ -31,6 +30,9 @@ void model_seqs();
void opt();
void run_masker();
void fastq2fasta();
+void view();
+void make_db();
+void db_info();
int main(int ac, const char* av[])
{
@@ -95,6 +97,9 @@ int main(int ac, const char* av[])
case Config::fastq2fasta:
fastq2fasta();
break;
+ case Config::dbinfo:
+ db_info();
+ break;
default:
return 1;
}
diff --git a/src/run/tools.cpp b/src/run/tools.cpp
index 5e76d01..7b6b6c8 100644
--- a/src/run/tools.cpp
+++ b/src/run/tools.cpp
@@ -118,14 +118,14 @@ void run_masker()
cout << '>' << string(id.data(), id.size()) << endl;
seq2 = seq;
Masking::get()(seq2.data(), seq2.size());
- for (size_t i = 0; i < seq.size(); ++i) {
+ /*for (size_t i = 0; i < seq.size(); ++i) {
char c = value_traits.alphabet[(long)seq[i]];
if (seq2[i] == value_traits.mask_char)
c = tolower(c);
cout << c;
}
- cout << endl;
- //cout << sequence(seq.data(), seq.size()) << endl;
+ cout << endl;*/
+ cout << sequence(seq2.data(), seq2.size()) << endl;
}
}
diff --git a/src/util/log_stream.h b/src/util/log_stream.h
index 9aacba1..06bd34c 100644
--- a/src/util/log_stream.h
+++ b/src/util/log_stream.h
@@ -69,6 +69,11 @@ extern Message_stream log_stream;
struct task_timer
{
+ task_timer(unsigned level = 1) :
+ level_(level),
+ msg_(0)
+ {
+ }
task_timer(const char *msg, unsigned level=1) :
level_(level),
msg_(msg)
diff --git a/src/basic/const.h b/src/util/queue.h
similarity index 60%
copy from src/basic/const.h
copy to src/util/queue.h
index 0dd7c7a..cc24dfc 100644
--- a/src/basic/const.h
+++ b/src/util/queue.h
@@ -16,31 +16,38 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
-#ifndef CONST_H_
-#define CONST_H_
+#include "tinythread.h"
-struct Const
+struct Queue
{
-
- enum {
- build_version = 109,
- daa_version = 0,
- seedp_bits = 10,
- seedp = 1<<seedp_bits,
- max_seed_weight = 32,
- max_shapes = 16,
- max_shape_len = 32
- };
-
- static const char* version_string;
- static const char* program_name;
- static const char* id_delimiters;
-
-};
-
-#define SIMPLE_SEARCH
-// #define FREQUENCY_MASKING
-// #define ST_JOIN
-// #define NO_COLLISION_FILTER
-
-#endif /* CONST_H_ */
+ enum { end = 0xffffffffffffffffllu };
+ Queue(size_t begin, size_t end) :
+ next_(begin),
+ end_(end)
+ {}
+ template<typename _f>
+ size_t get(_f &f)
+ {
+ mtx_.lock();
+ const size_t q = next_++;
+ if (q >= end_) {
+ mtx_.unlock();
+ return Queue::end;
+ }
+ f(q);
+ mtx_.unlock();
+ return q;
+ }
+ size_t next() const
+ {
+ return next_;
+ }
+ size_t get_end() const
+ {
+ return end_;
+ }
+private:
+ tthread::mutex mtx_;
+ size_t next_;
+ const size_t end_;
+};
\ No newline at end of file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/diamond-aligner.git
More information about the debian-med-commit
mailing list