[med-svn] [Git][med-team/diamond-aligner][master] 3 commits: New upstream version 0.9.26+dfsg
Steffen Möller
gitlab at salsa.debian.org
Sat Sep 21 10:41:18 BST 2019
Steffen Möller pushed to branch master at Debian Med / diamond-aligner
Commits:
30dedeaf by Steffen Moeller at 2019-09-19T22:20:39Z
New upstream version 0.9.26+dfsg
- - - - -
8c872d92 by Steffen Moeller at 2019-09-19T22:20:39Z
New upstream version
- - - - -
e8ce40ae by Steffen Moeller at 2019-09-19T22:20:40Z
Update upstream source from tag 'upstream/0.9.26+dfsg'
Update to upstream version '0.9.26+dfsg'
with Debian dir 5cb4cd797303a0519400544609fb808e036d039a
- - - - -
30 changed files:
- CMakeLists.txt
- README.md
- build_simple.sh
- debian/changelog
- src/ChangeLog
- src/align/banded_swipe_pipeline.cpp
- src/basic/basic.cpp
- src/basic/config.cpp
- src/basic/config.h
- src/basic/const.h
- src/basic/masking.cpp
- src/basic/masking.h
- src/data/ref_dictionary.cpp
- src/data/reference.cpp
- src/data/reference.h
- src/data/seed_array.cpp
- src/data/seed_array.h
- src/dp/needleman_wunsch.cpp
- src/dp/swipe/banded_3frame_swipe.cpp
- + src/lib/tantan/LambdaCalculator.cc
- + src/lib/tantan/LambdaCalculator.hh
- src/run/cluster.cpp
- src/run/double_indexed.cpp
- src/run/tools.cpp
- src/util/command_line_parser.cpp
- src/util/io/file_backed_buffer.h
- src/util/io/input_file.cpp
- src/util/io/input_file.h
- src/util/io/temp_file.cpp
- src/util/io/temp_file.h
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -149,6 +149,7 @@ add_executable(diamond src/run/main.cpp
src/tools/tools.cpp
src/util/system/getRSS.cpp
src/util/math/sparse_matrix.cpp
+ src/lib/tantan/LambdaCalculator.cc
)
if(EXTRA)
=====================================
README.md
=====================================
@@ -21,11 +21,6 @@ Keep posted about new developments by following me on
![image](https://anaconda.org/bioconda/diamond/badges/platforms.svg)
[![image](https://anaconda.org/bioconda/diamond/badges/downloads.svg)](https://anaconda.org/bioconda/diamond)
-Support
-=======
-
-The preferred support channel is the [Diamond community website](http://www.diamondsearch.org/). It provides a platform for users to exchange their experiences and get support directly from the developer. You may also use the GitHub issue tracker or send inquiries by email.
-
Quick start guide
=================
@@ -37,7 +32,7 @@ quick example for setting up and using the program on Linux.
Installing the software on your system may be done by downloading it in
binary format for immediate use:
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.25/diamond-linux64.tar.gz
+ wget http://github.com/bbuchfink/diamond/releases/download/v0.9.26/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted `diamond` binary file should be moved to a directory
@@ -78,6 +73,11 @@ The output file here is specified with the `–o` option and named
- The default e-value cutoff of DIAMOND is 0.001 while that of
BLAST is 10, so by default the program will search a lot more
stringently than BLAST and not report weak hits.
+
+Support
+=======
+
+The preferred support channel is the [Diamond community website](http://www.diamondsearch.org/). It provides a platform for users to exchange their experiences and get support directly from the developer. You may also use the GitHub issue tracker or send inquiries by email.
About
=====
=====================================
build_simple.sh
=====================================
@@ -81,4 +81,5 @@ g++ -std=gnu++11 -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
src/tools/tools.cpp \
src/util/system/getRSS.cpp \
src/util/math/sparse_matrix.cpp \
+ src/lib/tantan/LambdaCalculator.cc \
-lz -lpthread -o diamond
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+diamond-aligner (0.9.26+dfsg-1) UNRELEASED; urgency=medium
+
+ * Team upload.
+ * New upstream version
+
+ -- Steffen Moeller <moeller at debian.org> Fri, 20 Sep 2019 00:20:39 +0200
+
diamond-aligner (0.9.25+dfsg-1) unstable; urgency=medium
* New upstream version
=====================================
src/ChangeLog
=====================================
@@ -1,3 +1,13 @@
+[0.9.26]
+- Fixed a bug that could cause undefined behaviour when using a database file of format version < 2.
+- Fixed a compiler error when compiled as generic C++.
+- Program will no longer terminate with an error if unlink system call fails.
+- Added option `--tantan-minMaskProb` to set minimum repeat probability for tantan masking and changed the default value to 0.9.
+- Added option `--tantan-maxRepeatOffset` to set maximum tandem repeat period to consider and changed the default value to 15.
+- Added option `--tantan-ungapped` to use tantan in ungapped mode and changed the default to gapped mode.
+- Changed score matrix lambda calculation for tantan masking.
+- Reference masking is recomputed during alignment runs.
+
[0.9.25]
- Added support for the `sscinames` output field to print subject scientific names to the tabular output format.
- Fixed a compiler error for GCC 8.2.
=====================================
src/align/banded_swipe_pipeline.cpp
=====================================
@@ -165,7 +165,6 @@ void build_ranking_worker(PtrVector<::Target>::iterator begin, PtrVector<::Targe
void Pipeline::run(Statistics &stat)
{
- //cout << "Query=" << query_ids::get()[this->query_id].c_str() << endl;
task_timer timer("Init banded swipe pipeline", target_parallel ? 3 : UINT_MAX);
Config::set_option(config.padding, 32);
if (n_targets() == 0)
=====================================
src/basic/basic.cpp
=====================================
@@ -24,7 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "sequence.h"
#include "masking.h"
-const char* Const::version_string = "0.9.25";
+const char* Const::version_string = "0.9.26";
const char* Const::program_name = "diamond";
const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
=====================================
src/basic/config.cpp
=====================================
@@ -197,7 +197,10 @@ Config::Config(int argc, const char **argv)
("dbsize", 0, "effective database size (in letters)", db_size)
("no-auto-append", 0, "disable auto appending of DAA and DMND file extensions", no_auto_append)
("xml-blord-format", 0, "Use gnl|BL_ORD_ID| style format in XML output", xml_blord_format)
- ("stop-match-score", 0, "Set the match score of stop codons against each other.", stop_match_score, 1);
+ ("stop-match-score", 0, "Set the match score of stop codons against each other.", stop_match_score, 1)
+ ("tantan-minMaskProb", 0, "minimum repeat probability for masking (0.9)", tantan_minMaskProb, 0.9)
+ ("tantan-maxRepeatOffset", 0, "maximum tandem repeat period to consider (50)", tantan_maxRepeatOffset, 15)
+ ("tantan-ungapped", 0, "use tantan masking in ungapped mode", tantan_ungapped);
Options_group view_options("View options");
view_options.add()
@@ -274,8 +277,6 @@ Config::Config(int argc, const char **argv)
("query-parallel-limit", 0, "", query_parallel_limit, 1000000u)
("hard-masked", 0, "", hardmasked)
("cbs-window", 0, "", cbs_window, 40)
- ("tantan-r", 0, "", tantan_r, 0.005)
- ("tantan-s", 0, "", tantan_s, 0.5)
("no-unlink", 0, "", no_unlink)
("no-dict", 0, "", no_dict);
=====================================
src/basic/config.h
=====================================
@@ -175,10 +175,12 @@ struct Config
bool hardmasked;
int cbs_window;
double tantan_r;
- double tantan_s;
+ double tantan_minMaskProb;
bool no_unlink;
bool no_dict;
int stop_match_score;
+ int tantan_maxRepeatOffset;
+ bool tantan_ungapped;
enum {
makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
=====================================
src/basic/const.h
=====================================
@@ -23,7 +23,7 @@ struct Const
{
enum {
- build_version = 126,
+ build_version = 127,
seedp_bits = 10,
seedp = 1<<seedp_bits,
max_seed_weight = 32,
=====================================
src/basic/masking.cpp
=====================================
@@ -17,8 +17,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <math.h>
+#include <algorithm>
#include "masking.h"
#include "../lib/tantan/tantan.hh"
+#include "../lib/tantan/LambdaCalculator.hh"
using namespace std;
@@ -27,12 +29,21 @@ const uint8_t Masking::bit_mask = 128;
Masking::Masking(const Score_matrix &score_matrix)
{
- const double lambda = score_matrix.lambda(); // 0.324032
+ const unsigned n = value_traits.alphabet_size;
+ int int_matrix[20][20], *int_matrix_ptr[20];
+ std::copy(int_matrix, int_matrix + 20, int_matrix_ptr);
+ for (int i = 0; i < 20; ++i)
+ for (int j = 0; j < 20; ++j)
+ int_matrix[i][j] = score_matrix((char)i, (char)j);
+ cbrc::LambdaCalculator lc;
+ lc.calculate(int_matrix_ptr, 20);
+
+ const double lambda = lc.lambda(); // 0.324032
for (unsigned i = 0; i < size; ++i) {
mask_table_x_[i] = value_traits.mask_char;
mask_table_bit_[i] = (uint8_t)i | bit_mask;
for (unsigned j = 0; j < size; ++j)
- if (i < value_traits.alphabet_size && j < value_traits.alphabet_size)
+ if (i < n && j < n)
likelihoodRatioMatrix_[i][j] = exp(lambda * score_matrix(i, j));
}
std::copy(likelihoodRatioMatrix_, likelihoodRatioMatrix_ + size, probMatrixPointers_);
@@ -44,22 +55,22 @@ Masking::Masking(const Score_matrix &score_matrix)
void Masking::operator()(Letter *seq, size_t len) const
{
- tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), 50,
+ tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), config.tantan_maxRepeatOffset,
(tantan::const_double_ptr*)probMatrixPointers_,
- config.tantan_r, 0.05,
+ 0.005, 0.05,
0.9,
- 0, 0,
- config.tantan_s, (const tantan::uchar*)mask_table_x_);
+ config.tantan_ungapped ? 0.0 : firstGapProb_, config.tantan_ungapped ? 0.0 : otherGapProb_,
+ config.tantan_minMaskProb, (const tantan::uchar*)mask_table_x_);
}
void Masking::mask_bit(Letter *seq, size_t len) const
{
- tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), 50,
+ tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), config.tantan_maxRepeatOffset,
(tantan::const_double_ptr*)probMatrixPointers_,
- config.tantan_r, 0.05,
+ 0.005, 0.05,
0.9,
- 0, 0,
- config.tantan_s, (const tantan::uchar*)mask_table_bit_);
+ config.tantan_ungapped ? 0.0 : firstGapProb_, config.tantan_ungapped ? 0.0 : otherGapProb_,
+ config.tantan_minMaskProb, (const tantan::uchar*)mask_table_bit_);
}
void Masking::bit_to_hard_mask(Letter *seq, size_t len, size_t &n) const
@@ -88,7 +99,7 @@ void mask_worker(Atomic<size_t> *next, Sequence_set *seqs, const Masking *maskin
masking->mask_bit(seqs->ptr(i), seqs->length(i));
}
-void mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask)
+size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask)
{
vector<thread> threads;
Atomic<size_t> next(0);
@@ -96,4 +107,8 @@ void mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask)
threads.emplace_back(mask_worker, &next, &seqs, &masking, hard_mask);
for (auto &t : threads)
t.join();
+ size_t n = 0;
+ for (size_t i = 0; i < seqs.get_length(); ++i)
+ n += std::count(seqs[i].data(), seqs[i].end(), value_traits.mask_char);
+ return n;
}
\ No newline at end of file
=====================================
src/basic/masking.h
=====================================
@@ -41,4 +41,4 @@ private:
char mask_table_x_[size], mask_table_bit_[size];
};
-void mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
\ No newline at end of file
+size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
\ No newline at end of file
=====================================
src/data/ref_dictionary.cpp
=====================================
@@ -99,7 +99,7 @@ void ReferenceDictionary::build_lazy_dict(DatabaseFile &db_file)
}
db_file.rewind();
vector<unsigned> block_to_database_id;
- db_file.load_seqs(block_to_database_id, std::numeric_limits<size_t>::max(), false, &ref_seqs::data_, &ref_ids::data_, false, &filter);
+ db_file.load_seqs(block_to_database_id, std::numeric_limits<size_t>::max(), &ref_seqs::data_, &ref_ids::data_, false, &filter);
std::sort(m.begin(), m.end());
dict_to_lazy_dict_id_.clear();
dict_to_lazy_dict_id_.resize(dict_size);
=====================================
src/data/reference.cpp
=====================================
@@ -79,13 +79,13 @@ struct Pos_record
void DatabaseFile::init()
{
read_header(*this, ref_header);
- *this >> header2;
if (ref_header.build < min_build_required || ref_header.db_version < MIN_DB_VERSION)
throw std::runtime_error("Database was built with an older version of Diamond and is incompatible.");
if (ref_header.db_version > ReferenceHeader::current_db_version)
throw std::runtime_error("Database was built with a newer version of Diamond and is incompatible.");
if (ref_header.sequences == 0)
throw std::runtime_error("Incomplete database file. Database building did not complete successfully.");
+ *this >> header2;
pos_array_offset = ref_header.pos_array_offset;
}
@@ -268,7 +268,7 @@ void DatabaseFile::seek_direct() {
seek(sizeof(ReferenceHeader) + sizeof(ReferenceHeader2) + 8);
}
-bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, bool masked, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids, const vector<bool> *filter)
+bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids, const vector<bool> *filter)
{
task_timer timer("Loading reference sequences");
seek(pos_array_offset);
@@ -318,7 +318,6 @@ bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_
(*dst_seq)->finish_reserve();
if(load_ids) (*dst_id)->finish_reserve();
seek(start_offset);
- size_t masked_letters = 0;
for (size_t n = 0; n < seqs; ++n) {
if (filter && filtered_pos[n]) seek(filtered_pos[n]);
@@ -329,16 +328,12 @@ bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_
read((*dst_id)->ptr(n), (*dst_id)->length(n) + 1);
else
if (!seek_forward('\0')) throw std::runtime_error("Unexpected end of file.");
- if (config.masking == 1 && masked)
- Masking::get().bit_to_hard_mask((*dst_seq)->ptr(n), (*dst_seq)->length(n), masked_letters);
- else
- Masking::get().remove_bit_mask((*dst_seq)->ptr(n), (*dst_seq)->length(n));
+ Masking::get().remove_bit_mask((*dst_seq)->ptr(n), (*dst_seq)->length(n));
if (!config.sfilt.empty() && strstr((**dst_id)[n].c_str(), config.sfilt.c_str()) == 0)
memset((*dst_seq)->ptr(n), value_traits.mask_char, (*dst_seq)->length(n));
}
timer.finish();
(*dst_seq)->print_stats();
- log_stream << "Masked letters = " << masked_letters << endl;
blocked_processing = seqs_processed < ref_header.sequences;
return true;
=====================================
src/data/reference.h
=====================================
@@ -80,7 +80,7 @@ struct DatabaseFile : public InputFile
static DatabaseFile* auto_create_from_fasta();
static bool is_diamond_db(const string &file_name);
void rewind();
- bool load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, bool masked, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids = true, const vector<bool> *filter = NULL);
+ bool load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids = true, const vector<bool> *filter = NULL);
void get_seq();
void read_seq(string &id, vector<char> &seq);
bool has_taxon_id_lists();
=====================================
src/data/seed_array.cpp
=====================================
@@ -93,7 +93,7 @@ struct BuildCallback
};
template<typename _filter>
-SeedArray::SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector<size_t> seq_partition, const _filter *filter)
+SeedArray::SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector<size_t> &seq_partition, const _filter *filter)
{
task_timer timer("Allocating memory for seed array", 3);
for (size_t i = range.begin(); i < range.end(); ++i) {
@@ -108,6 +108,6 @@ SeedArray::SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogr
seqs.enum_seeds(cb, seq_partition, shape, shape + 1, filter);
}
-template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>, const No_filter *);
-template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>, const Seed_set *);
-template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>, const Hashed_seed_set *);
\ No newline at end of file
+template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>&, const No_filter *);
+template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>&, const Seed_set *);
+template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>&, const Hashed_seed_set *);
\ No newline at end of file
=====================================
src/data/seed_array.h
=====================================
@@ -45,7 +45,7 @@ struct SeedArray
} PACKED_ATTRIBUTE;
template<typename _filter>
- SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector<size_t> seq_partition, const _filter *filter);
+ SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector<size_t> &seq_partition, const _filter *filter);
Entry* begin(unsigned i)
{
=====================================
src/dp/needleman_wunsch.cpp
=====================================
@@ -167,7 +167,7 @@ const Fixed_score_buffer<_score>& needleman_wunsch(sequence query, sequence subj
return mtx.score_buffer();
}
-template const Fixed_score_buffer<int>& needleman_wunsch(sequence query, sequence subject, int &max_score, const Local&, const int&);
+template const Fixed_score_buffer<int>& needleman_wunsch<int, Local>(sequence query, sequence subject, int &max_score, const Local&, const int&);
int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int sbegin, int send, unsigned node, unsigned edge, Diag_graph &diags, bool log)
{
=====================================
src/dp/swipe/banded_3frame_swipe.cpp
=====================================
@@ -233,6 +233,26 @@ void banded_3frame_swipe(const TranslatedSequence &query, Strand strand, vector<
}
}
+template<typename _sv>
+void banded_3frame_swipe_targets(vector<DpTarget>::iterator begin,
+ vector<DpTarget>::iterator end,
+ bool score_only,
+ const TranslatedSequence &query,
+ Strand strand,
+ DpStat &stat,
+ bool parallel,
+ bool overflow_only)
+{
+ for (vector<DpTarget>::iterator i = begin; i < end; i += ScoreTraits<_sv>::CHANNELS) {
+ if (!overflow_only || i->overflow) {
+ if (score_only || config.disable_traceback)
+ banded_3frame_swipe<_sv, ScoreOnly>(query, strand, i, i + std::min(vector<DpTarget>::iterator::difference_type(ScoreTraits<_sv>::CHANNELS), end - i), stat, parallel);
+ else
+ banded_3frame_swipe<_sv, Traceback>(query, strand, i, i + std::min(vector<DpTarget>::iterator::difference_type(ScoreTraits<_sv>::CHANNELS), end - i), stat, parallel);
+ }
+ }
+}
+
void banded_3frame_swipe_worker(vector<DpTarget>::iterator begin,
vector<DpTarget>::iterator end,
Atomic<size_t> *next,
@@ -242,15 +262,12 @@ void banded_3frame_swipe_worker(vector<DpTarget>::iterator begin,
{
DpStat stat;
size_t pos;
- while (begin + (pos = next->post_add(config.swipe_chunk_size)) < end) {
- const auto e = min(begin + pos + config.swipe_chunk_size, end);
- for (vector<DpTarget>::iterator i = begin + pos; i < e; i += 8) {
- if (score_only || config.disable_traceback)
- banded_3frame_swipe<score_vector<int16_t>, ScoreOnly>(*query, strand, i, min(i + 8, end), stat, true);
- else
- banded_3frame_swipe<score_vector<int16_t>, Traceback>(*query, strand, i, min(i + 8, end), stat, true);
- }
- }
+ while (begin + (pos = next->post_add(config.swipe_chunk_size)) < end)
+#ifdef __SSE2__
+ banded_3frame_swipe_targets<score_vector<int16_t>>(begin + pos, min(begin + pos + config.swipe_chunk_size, end), score_only, *query, strand, stat, true, false);
+#else
+ banded_3frame_swipe_targets<int32_t>(begin + pos, min(begin + pos + config.swipe_chunk_size, end), score_only, *query, strand, stat, true, false);
+#endif
}
void banded_3frame_swipe(const TranslatedSequence &query, Strand strand, vector<DpTarget>::iterator target_begin, vector<DpTarget>::iterator target_end, DpStat &stat, bool score_only, bool parallel)
@@ -278,26 +295,11 @@ void banded_3frame_swipe(const TranslatedSequence &query, Strand strand, vector<
delete i->tmp;
}
}
- else {
- for (vector<DpTarget>::iterator i = target_begin; i < target_end; i += 8)
- if (score_only || config.disable_traceback)
- banded_3frame_swipe<score_vector<int16_t>, ScoreOnly>(query, strand, i, i + std::min(vector<DpTarget>::iterator::difference_type(8), target_end - i), stat, false);
- else
- banded_3frame_swipe<score_vector<int16_t>, Traceback>(query, strand, i, i + std::min(vector<DpTarget>::iterator::difference_type(8), target_end - i), stat, false);
- }
+ else
+ banded_3frame_swipe_targets<score_vector<int16_t>>(target_begin, target_end, score_only, query, strand, stat, false, false);
- for (vector<DpTarget>::iterator i = target_begin; i < target_end; ++i)
- if (i->overflow) {
- if (score_only || config.disable_traceback)
- banded_3frame_swipe<int32_t, ScoreOnly>(query, strand, i, i + 1, stat, false);
- else
- banded_3frame_swipe<int32_t, Traceback>(query, strand, i, i + 1, stat, false);
- }
+ banded_3frame_swipe_targets<int32_t>(target_begin, target_end, score_only, query, strand, stat, false, true);
#else
- for (vector<DpTarget>::iterator i = target_begin; i < target_end; ++i)
- if (score_only || config.disable_traceback)
- banded_3frame_swipe<int32_t, ScoreOnly>(query, strand, i, i + 1, stat, false);
- else
- banded_3frame_swipe<int32_t, Traceback>(query, strand, i, i + 1, stat, false);
+ banded_3frame_swipe_targets<int32_t>(target_begin, target_end, score_only, query, strand, stat, false, false);
#endif
}
\ No newline at end of file
=====================================
src/lib/tantan/LambdaCalculator.cc
=====================================
@@ -0,0 +1,408 @@
+// Copyright 2015 Yutaro Konta
+
+#include "LambdaCalculator.hh"
+#include <vector>
+#include <cassert>
+#include <cstdio> // sprintf
+#include <cstdlib>
+#include <cfloat>
+#include <cmath>
+//#include <iostream>
+using namespace std;
+
+static double roundToFewDigits(double x)
+{
+ // This rounding fixes some inaccuracies, e.g. for DNA with a simple
+ // match/mismatch matrix it's likely to make all the probabilities
+ // exactly 0.25, as they should be.
+ char buffer[32];
+ sprintf(buffer, "%g", x);
+ return atof(buffer);
+}
+
+static double** makematrix(int m, int n, double val)
+{
+ double** mat = new double* [m];
+ for (int i=0; i<m; i++)
+ {
+ mat[i] = new double [n];
+ for (int j=0; j<n; j++)
+ mat[i][j] = val;
+ }
+ return mat;
+}
+
+static void deletematrix(double** a, int m)
+{
+ for (int i=0; i<m; i++)
+ delete[] a[i];
+ delete[] a;
+}
+
+static double summatrix(double** a, int m, int n)
+{
+ double s = 0;
+ for (int i=0; i<m; i++)
+ for (int j=0; j<n; j++)
+ s += a[i][j];
+ return s;
+}
+
+static int max_index(double** a, int n, int i)
+{
+ double max = -DBL_MAX;
+ int maxindex = -1;
+
+ for (int j=i; j<n; j++)
+ {
+ if (fabs(a[j][i]) > max)
+ {
+ max = fabs(a[j][i]);
+ maxindex = j;
+ }
+ }
+ return maxindex;
+}
+
+static void swap_matrix(double** a, int i, int j)
+{
+ double* v = a[i];
+ a[i] = a[j];
+ a[j] = v;
+}
+
+static void swap_vector(int* a, int i, int j)
+{
+ int v = a[i];
+ a[i] = a[j];
+ a[j] = v;
+}
+
+static bool lu_pivoting(double** a, int* idx, int n)
+{
+ int v;
+
+ for (int i=0; i<n; i++)
+ idx[i] = i;
+
+ for (int i=0; i<n; i++)
+ {
+ v = max_index(a, n, i);
+ assert(v >= 0);
+ if (fabs(a[v][i]) < 1e-10)
+ {
+ return false; // singular matrix!
+ }
+
+ swap_matrix(a, i, v);
+ swap_vector(idx, i, v);
+
+ a[i][i] = 1.0/a[i][i];
+ for (int j=i+1; j<n; j++)
+ {
+ a[j][i] = a[j][i] * a[i][i];
+ for (int k=i+1; k<n; k++)
+ a[j][k] = a[j][k] - a[j][i] * a[i][k];
+ }
+ }
+ return true;
+}
+
+static void solvep(double **a, double *x, double *b, int n)
+{
+ double *y = new double [n];
+
+ for (int i=0; i<n; i++)
+ {
+ y[i] = b[i];
+ for (int j=0; j<i; j++)
+ y[i] -= a[i][j] * y[j];
+ }
+
+ for (int i=n-1; i>=0; i--)
+ {
+ x[i] = y[i];
+ for (int j=i+1; j<n; j++)
+ x[i] -= a[i][j] * x[j];
+ x[i] *= a[i][i]; // needed because a[i][i] is inverted
+ }
+ delete[] y;
+}
+
+static void transpose(double **a, int n)
+{
+ double v;
+ for (int i=0; i<n; i++)
+ {
+ for (int j=0; j<i; j++)
+ {
+ v = a[i][j];
+ a[i][j] = a[j][i];
+ a[j][i] = v;
+ }
+ }
+}
+
+static bool invert(double **a, double **inv, int n)
+{
+ int* idx = new int [n];
+
+ double **e = makematrix(n,n,0);
+
+ if(!lu_pivoting(a, idx, n))
+ return false;
+
+ for (int i=0; i<n; i++)
+ e[idx[i]][i] = 1; // transposed
+
+ delete[] idx;
+
+ for (int i=0; i<n; i++)
+ solvep(a, inv[i], e[i], n);
+
+ deletematrix(e, n);
+ transpose(inv, n); // transpose inv to make the inverted matrix of a
+ return true;
+}
+
+static bool calculate_inv_sum(double **matrix, int alpha_size, double tau, double* inv_sum)
+{
+ double **m = makematrix(alpha_size, alpha_size, 0);
+ double **y = makematrix(alpha_size, alpha_size, 0);
+
+ for (int i=0; i<alpha_size; i++)
+ for (int j=0; j<alpha_size; j++)
+ m[i][j] = exp(tau * matrix[i][j]);
+
+ if(!invert(m, y, alpha_size))
+ return false;
+
+ *inv_sum = summatrix(y, alpha_size, alpha_size);
+
+ deletematrix(m, alpha_size);
+ deletematrix(y, alpha_size);
+ return true;
+}
+
+namespace cbrc{
+
+void LambdaCalculator::setBad(){
+ lambda_ = -1;
+ letterProbs1_.clear();
+ letterProbs2_.clear();
+}
+
+bool LambdaCalculator::find_ub(double **matrix, int alpha_size, double *ub)
+{
+ double r_max_min = DBL_MAX;
+ double c_max_min = DBL_MAX;
+ double r_max;
+ double c_max;
+
+ double r_min;
+ double c_min;
+
+ int l_r = 0;
+ int l_c = 0;
+
+ for (int i=0; i<alpha_size; i++)
+ {
+ r_max = -DBL_MAX;
+ r_min = DBL_MAX;
+ for (int j=0; j<alpha_size; j++)
+ {
+ if (matrix[i][j] > r_max)
+ r_max = matrix[i][j];
+
+ if (matrix[i][j] < r_min)
+ r_min = matrix[i][j];
+ }
+ if (r_max == 0 && r_min == 0)
+ l_r++;
+ else if (r_max <= 0 || r_min >= 0)
+ return false;
+ else if (r_max < r_max_min)
+ r_max_min = r_max;
+ }
+
+ for (int j=0; j<alpha_size; j++)
+ {
+ c_max = -DBL_MAX;
+ c_min = DBL_MAX;
+ for (int i=0; i<alpha_size; i++)
+ {
+ if (matrix[i][j] > c_max)
+ c_max = matrix[i][j];
+
+ if (matrix[i][j] < c_min)
+ c_min = matrix[i][j];
+ }
+ if (c_max == 0 && c_min == 0)
+ l_c++;
+ else if (c_max <= 0 || c_min >= 0)
+ return false;
+ else if (c_max < c_max_min)
+ c_max_min = c_max;
+ }
+
+ if (l_r == alpha_size) return false;
+
+ // the multiplication by 1.1 is sometimes necessary, presumably to
+ // prevent the upper bound from being too tight:
+ if (r_max_min > c_max_min)
+ *ub = 1.1 * log(1.0 * (alpha_size - l_r))/r_max_min;
+ else
+ *ub = 1.1 * log(1.0 * (alpha_size - l_c))/c_max_min;
+
+ return true;
+}
+
+bool LambdaCalculator::binary_search(double** matrix, int alpha_size, double lb, double ub, vector<double>& letprob1, vector<double>& letprob2, double* lambda, int maxiter)
+{
+ double l=0;
+ double r=0;
+ double l_sum=0;
+ double r_sum=0;
+ int iter=0;
+
+ while (iter < maxiter && (l>=r || (l_sum < 1.0 && r_sum < 1.0) || (l_sum > 1.0 && r_sum > 1.0)))
+ {
+ l = lb + (ub - lb)*rand()/RAND_MAX;
+ r = lb + (ub - lb)*rand()/RAND_MAX;
+
+ if (!calculate_inv_sum(matrix, alpha_size, l, &l_sum) || !calculate_inv_sum(matrix, alpha_size, r, &r_sum))
+ {
+ l = 0;
+ r = 0;
+ }
+ iter++;
+ }
+
+ if (iter >= maxiter)
+ return false;
+
+ while (l_sum != 1.0 && r_sum != 1.0 && (l+r)/2.0 != l && (l+r)/2.0 != r)
+ {
+ double mid = (l + r)/2.0;
+ double mid_sum;
+ if (!calculate_inv_sum(matrix, alpha_size, mid, &mid_sum))
+ return false;
+
+ if (fabs(mid_sum) >= DBL_MAX)
+ return false;
+
+ if ((l_sum < 1.0 && mid_sum >= 1.0) || (l_sum > 1.0 && mid_sum <= 1.0))
+ {
+ r = mid;
+ r_sum = mid_sum;
+ }
+
+ else if ((r_sum < 1.0 && mid_sum >= 1.0) || (r_sum > 1.0 && mid_sum <= 1.0))
+ {
+ l = mid;
+ l_sum = mid_sum;
+ }
+
+ else
+ return false;
+ }
+
+ if (fabs(l_sum - 1.0) < fabs(r_sum - 1.0))
+ {
+ if (check_lambda(matrix, l, alpha_size, letprob1, letprob2))
+ {
+ *lambda = l;
+ return true;
+ }
+ return false;
+ }
+
+ if (check_lambda(matrix, r, alpha_size, letprob1, letprob2))
+ {
+ *lambda = r;
+ return true;
+ }
+ return false;
+}
+
+double LambdaCalculator::calculate_lambda(double** matrix, int alpha_size, vector<double>& letprob1, vector<double>& letprob2, int maxiter, int max_boundary_search_iter, double lb_ratio)
+{
+ double ub;
+
+ if (!find_ub(matrix, alpha_size, &ub))
+ return -1;
+
+ double lb = ub*lb_ratio;
+ double lambda = -1;
+ int iter = 0;
+ bool flag = false;
+
+ while (!flag && iter < maxiter)
+ {
+ flag = binary_search(matrix, alpha_size, lb, ub, letprob1, letprob2, &lambda, max_boundary_search_iter);
+ iter++;
+ }
+
+ return lambda;
+}
+
+bool LambdaCalculator::check_lambda(double** matrix, double lambda, int alpha_size, vector<double>& letprob1, vector<double>& letprob2)
+{
+ double **m = makematrix(alpha_size, alpha_size, 0);
+ double **y = makematrix(alpha_size, alpha_size, 0);
+
+ for (int i=0; i<alpha_size; i++)
+ for (int j=0; j<alpha_size; j++)
+ m[i][j] = exp(lambda * matrix[i][j]);
+
+ invert(m, y, alpha_size);
+
+ for (int i=0; i<alpha_size; i++)
+ {
+ double p = 0;
+ for (int j=0;j<alpha_size; j++)
+ p += y[i][j];
+ if (p < 0 || p > 1)
+ {
+ letprob2.clear();
+ return false;
+ }
+ letprob2.push_back(roundToFewDigits(p));
+ }
+
+ for (int j=0; j<alpha_size; j++)
+ {
+ double q = 0;
+ for (int i=0; i<alpha_size; i++)
+ q += y[i][j];
+ if (q < 0 || q > 1)
+ {
+ letprob2.clear();
+ letprob1.clear();
+ return false;
+ }
+ letprob1.push_back(roundToFewDigits(q));
+ }
+
+ deletematrix(m, alpha_size);
+ deletematrix(y, alpha_size);
+
+ return true;
+}
+
+void LambdaCalculator::calculate(const const_int_ptr *matrix, int alphSize) {
+ assert(alphSize >= 0);
+ setBad();
+
+ int maxiter = 1000;
+ int max_boundary_search_iter = 100;
+ double lb_ratio = 1e-6;
+
+ double** mat = makematrix(alphSize, alphSize, 0);
+ for (int i=0; i<alphSize; i++)
+ for (int j=0; j<alphSize; j++)
+ mat[i][j] = matrix[i][j];
+ lambda_ = calculate_lambda(mat, alphSize, letterProbs1_, letterProbs2_, maxiter, max_boundary_search_iter, lb_ratio);
+ deletematrix(mat, alphSize);
+}
+}
=====================================
src/lib/tantan/LambdaCalculator.hh
=====================================
@@ -0,0 +1,58 @@
+// Copyright 2008 Michiaki Hamada
+// Modified 2015 Yutaro Konta
+
+// This class calculates the scale factor (lambda), and the letter
+// probabilities, that are implicit in a scoring matrix. The
+// calculation might fail for various reasons, putting it into a
+// "bad/undefined" state.
+
+// If the score matrix is symmetric, then the two sets of letter
+// probabilities should be identical. With this code, they might
+// differ minutely from exact identity.
+
+#ifndef LAMBDA_CALCULATOR_HH
+#define LAMBDA_CALCULATOR_HH
+
+#include <vector>
+
+namespace cbrc{
+
+typedef const int *const_int_ptr;
+
+class LambdaCalculator{
+ public:
+ LambdaCalculator() { setBad(); }
+
+ void calculate(const const_int_ptr *matrix, int alphSize);
+
+ // Put us in the bad/undefined state.
+ void setBad();
+
+ // Are we in the bad/undefined state?
+ bool isBad() const { return (lambda_ < 0); }
+
+ // The scale factor. In the bad/undefined state, it is negative.
+ double lambda() const { return lambda_; }
+
+ // The probabilities of letters corresponding to matrix rows (1st index).
+ // In the bad/undefined state, it is a null pointer.
+ const double *letterProbs1() const {return isBad() ? 0 : &letterProbs1_[0];}
+
+ // The probabilities of letters corresponding to matrix columns (2nd index).
+ // In the bad/undefined state, it is a null pointer.
+ const double *letterProbs2() const {return isBad() ? 0 : &letterProbs2_[0];}
+
+ private:
+ double lambda_;
+ std::vector<double> letterProbs1_;
+ std::vector<double> letterProbs2_;
+
+ bool find_ub(double **matrix, int alpha_size, double *ub);
+ bool binary_search(double** matrix, int alpha_size, double lb, double ub, std::vector<double>& letprob1, std::vector<double>& letprob2, double* lambda, int maxiter);
+ double calculate_lambda(double** matrix, int alpha_size, std::vector<double>& letprob1, std::vector<double>& letprob2, int maxiter, int max_boundary_search_iter, double lb_ratio);
+ bool check_lambda(double** matrix, double lambda, int alpha_size, std::vector<double>& letprob1, std::vector<double>& letprob2);
+};
+
+} // end namespace
+
+#endif
=====================================
src/run/cluster.cpp
=====================================
@@ -121,7 +121,7 @@ void run() {
String_set<0> *rep_ids;
vector<unsigned> rep_database_id, rep_block_id(seq_count);
db->rewind();
- db->load_seqs(rep_database_id, (size_t)1e11, true, &rep_seqs, &rep_ids, true, &rep2);
+ db->load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2);
for (size_t i = 0; i < rep_database_id.size(); ++i)
rep_block_id[rep_database_id[i]] = (unsigned)i;
=====================================
src/run/double_indexed.cpp
=====================================
@@ -129,7 +129,16 @@ void run_ref_chunk(DatabaseFile &db_file,
const vector<unsigned> &block_to_database_id)
{
log_stream << "Current RSS: " << getCurrentRSS() << ", Peak RSS: " << getPeakRSS() << endl;
- task_timer timer("Building reference histograms");
+
+ task_timer timer;
+ if (config.masking == 1) {
+ timer.go("Masking reference");
+ size_t n = mask_seqs(*ref_seqs::data_, Masking::get());
+ timer.finish();
+ log_stream << "Masked letters: " << n << endl;
+ }
+
+ timer.go("Building reference histograms");
if(config.algo==Config::query_indexed)
ref_hst = Partitioned_histogram(*ref_seqs::data_, false, query_seeds);
else if(query_seeds_hashed != 0)
@@ -237,7 +246,6 @@ void run_query_chunk(DatabaseFile &db_file,
for (current_ref_block = 0; db_file.load_seqs(block_to_database_id,
(size_t)(config.chunk_size*1e9),
- true,
&ref_seqs::data_,
&ref_ids::data_,
true,
@@ -308,7 +316,6 @@ void master_thread(DatabaseFile *db_file, Timer &total_timer, Metadata &metadata
db_file->seek_seq(query_file_offset);
if (!db_file->load_seqs(query_block_to_database_id,
(size_t)(config.chunk_size * 1e9),
- true,
&query_seqs::data_,
&query_ids::data_,
true,
=====================================
src/run/tools.cpp
=====================================
@@ -46,7 +46,7 @@ void random_seqs()
{
DatabaseFile db_file(config.database);
vector<unsigned> v;
- db_file.load_seqs(v, std::numeric_limits<size_t>::max(), true, &ref_seqs::data_, &ref_ids::data_);
+ db_file.load_seqs(v, std::numeric_limits<size_t>::max(), &ref_seqs::data_, &ref_ids::data_);
cout << "Sequences = " << ref_seqs::get().get_length() << endl;
std::set<unsigned> n;
const size_t count = atoi(config.seq_no[0].c_str());
@@ -89,7 +89,7 @@ void db_stat()
{
DatabaseFile db_file(config.database);
vector<unsigned> v;
- db_file.load_seqs(v, std::numeric_limits<size_t>::max(), true, &ref_seqs::data_, &ref_ids::data_);
+ db_file.load_seqs(v, std::numeric_limits<size_t>::max(), &ref_seqs::data_, &ref_ids::data_);
cout << "Sequences = " << ref_seqs::get().get_length() << endl;
size_t letters = 0;
@@ -116,6 +116,7 @@ void run_masker()
vector<Letter> seq, seq2;
vector<char> id;
const FASTA_format format;
+ size_t letters = 0, seqs = 0, seqs_total = 0;
while (format.get_seq(id, seq, f)) {
cout << '>' << string(id.data(), id.size()) << endl;
seq2 = seq;
@@ -128,6 +129,16 @@ void run_masker()
}
cout << endl;*/
cout << sequence(seq2.data(), seq2.size()) << endl;
+ size_t n = 0;
+ for (size_t i = 0; i < seq2.size(); ++i)
+ if (seq2[i] == value_traits.mask_char) {
+ ++n;
+ }
+ letters += n;
+ if (n > 0)
+ ++seqs;
+ ++seqs_total;
+ cerr << "#Sequences: " << seqs << "/" << seqs_total << ", #Letters: " << letters << endl;
}
}
@@ -159,7 +170,7 @@ void test_io()
DatabaseFile db(config.database);
vector<unsigned> v;
- db.load_seqs(v, std::numeric_limits<size_t>::max(), true, &ref_seqs::data_, &ref_ids::data_);
+ db.load_seqs(v, std::numeric_limits<size_t>::max(), &ref_seqs::data_, &ref_ids::data_);
size_t total = ref_seqs::get().raw_len() + ref_ids::get().raw_len();
cout << "MBytes/sec = " << total / 1e6 / t.getElapsedTime() << endl;
=====================================
src/util/command_line_parser.cpp
=====================================
@@ -99,7 +99,7 @@ void Command_line_parser::store(int count, const char ** str, unsigned &command)
void Command_line_parser::print_help()
{
- static const size_t col1_width = 23;
+ static const size_t col1_width = 25;
cout << "Syntax: diamond COMMAND [OPTIONS]" << endl << endl;
cout << "Commands:" << endl;
for (vector<pair<string, string> >::const_iterator i = commands_.begin(); i != commands_.end(); ++i)
=====================================
src/util/io/file_backed_buffer.h
=====================================
@@ -27,7 +27,7 @@ struct FileBackedBuffer : public TempFile, public InputFile
FileBackedBuffer():
TempFile(),
- InputFile(*dynamic_cast<OutputFile*>(this))
+ InputFile(*dynamic_cast<TempFile*>(this))
{
}
=====================================
src/util/io/input_file.cpp
=====================================
@@ -29,6 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "compressed_stream.h"
#include "input_stream_buffer.h"
#include "../../basic/config.h"
+#include "temp_file.h"
bool is_gzip_stream(const unsigned char *b)
{
@@ -40,7 +41,8 @@ bool is_gzip_stream(const unsigned char *b)
InputFile::InputFile(const string &file_name, int flags) :
Deserializer(new InputStreamBuffer(new FileSource(file_name))),
- file_name(file_name)
+ file_name(file_name),
+ unlinked(false)
{
if (file_name.empty() || file_name == "-")
return;
@@ -64,9 +66,10 @@ InputFile::InputFile(const string &file_name, int flags) :
buffer_ = new InputStreamBuffer(new ZlibSource(buffer_));
}
-InputFile::InputFile(OutputFile &tmp_file, int flags) :
+InputFile::InputFile(TempFile &tmp_file, int flags) :
Deserializer(new InputStreamBuffer(new FileSource(tmp_file.file_name(), tmp_file.file()))),
- file_name(tmp_file.file_name())
+ file_name(tmp_file.file_name()),
+ unlinked(tmp_file.unlinked)
{
tmp_file.rewind();
}
@@ -74,11 +77,6 @@ InputFile::InputFile(OutputFile &tmp_file, int flags) :
void InputFile::close_and_delete()
{
close();
-#ifdef WIN32
- bool windows = true;
-#else
- bool windows = false;
-#endif
- if ((windows || config.no_unlink) && remove(file_name.c_str()) != 0)
+ if (!unlinked && remove(file_name.c_str()) != 0)
std::cerr << "Warning: Failed to delete temporary file " << file_name << std::endl;
}
\ No newline at end of file
=====================================
src/util/io/input_file.h
=====================================
@@ -29,16 +29,19 @@ using std::vector;
using std::string;
using std::runtime_error;
+struct TempFile;
+
struct InputFile : public Deserializer
{
enum { BUFFERED = 1 };
InputFile(const string &file_name, int flags = 0);
- InputFile(OutputFile &tmp_file, int flags = 0);
+ InputFile(TempFile &tmp_file, int flags = 0);
void close_and_delete();
string file_name;
+ bool unlinked;
};
=====================================
src/util/io/temp_file.cpp
=====================================
@@ -35,6 +35,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "input_file.h"
using std::vector;
+using std::string;
unsigned TempFile::n = 0;
uint64_t TempFile::hash_key;
@@ -61,6 +62,7 @@ pair<string, int> TempFile::init()
#endif
#ifdef _MSC_VER
+ unlinked = false;
return string(s);
#else
int fd = mkstemp(s);
@@ -68,10 +70,10 @@ pair<string, int> TempFile::init()
perror(0);
throw std::runtime_error(string("Error opening temporary file ") + string(s));
}
- if (!config.no_unlink && unlink(s) < 0) {
- perror(0);
- throw std::runtime_error("Error calling unlink.");
- }
+ if (config.no_unlink)
+ unlinked = false;
+ else
+ unlinked = (unlink(s) >= 0);
return std::make_pair(string(s), fd);
#endif
}
=====================================
src/util/io/temp_file.h
=====================================
@@ -22,22 +22,21 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <utility>
#include "output_file.h"
-using std::string;
-
struct TempFile : public OutputFile
{
TempFile();
- static string get_temp_dir();
+ static std::string get_temp_dir();
static unsigned n;
static uint64_t hash_key;
+ bool unlinked;
private:
#ifdef _MSC_VER
- static string init();
+ std::string init();
#else
- static std::pair<string, int> init();
+ std::pair<std::string, int> init();
#endif
};
View it on GitLab: https://salsa.debian.org/med-team/diamond-aligner/compare/7c934577bb3424154f24c391837747555f1a4421...e8ce40aecae50c7f3e5ec40044714729070873f0
--
View it on GitLab: https://salsa.debian.org/med-team/diamond-aligner/compare/7c934577bb3424154f24c391837747555f1a4421...e8ce40aecae50c7f3e5ec40044714729070873f0
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190921/71169fea/attachment-0001.html>
More information about the debian-med-commit
mailing list