[med-svn] [Git][med-team/diamond-aligner][upstream] New upstream version 0.9.24+dfsg

Sat Dec 29 20:21:25 GMT 2018

Andreas Tille pushed to branch upstream at Debian Med / diamond-aligner


Commits:
0238729e by Andreas Tille at 2018-12-29T20:10:17Z
New upstream version 0.9.24+dfsg
- - - - -


29 changed files:

- CMakeLists.txt
- README.md
- build_simple.sh
- src/ChangeLog
- src/align/align.cpp
- src/basic/basic.cpp
- src/basic/config.cpp
- src/basic/config.h
- src/basic/const.h
- src/basic/sequence.h
- src/data/queries.cpp
- src/data/queries.h
- src/data/reference.cpp
- src/data/reference.h
- src/dp/score_vector.h
- src/output/blast_tab_format.cpp
- src/output/output_format.h
- src/output/target_culling.h
- src/run/main.cpp
- src/run/tools.cpp
- src/util/command_line_parser.cpp
- src/util/io/file_source.cpp
- src/util/io/input_file.cpp
- src/util/log_stream.h
- + src/util/sequence/sequence.cpp
- + src/util/sequence/sequence.h
- src/util/system/system.cpp
- src/util/util.cpp
- src/util/util.h


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -3,6 +3,18 @@ project (DIAMOND)
 
 option(BUILD_STATIC "BUILD_STATIC" OFF)
 option(EXTRA "EXTRA" OFF)
+option(STATIC_LIBGCC "STATIC_LIBGCC" OFF)
+option(STATIC_LIBSTDC++ "STATIC_LIBSTDC++" OFF)
+option(SSSE3 "SSSE3" OFF)
+option(POPCNT "POPCNT" OFF)
+
+IF(STATIC_LIBSTDC++)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
+endif()
+
+IF(STATIC_LIBGCC)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libgcc")
+endif()
 
 if(BUILD_STATIC)
     set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
@@ -12,7 +24,17 @@ endif()
 
 set(CMAKE_CXX_STANDARD 11)
 
-if(CMAKE_BUILD_MARCH)
+if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
+else()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11")
+endif()
+
+if(SSSE3)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mssse3")
+  if(POPCNT)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt")
+  endif()
+elseif(CMAKE_BUILD_MARCH)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${CMAKE_BUILD_MARCH}")
 else()
   include(CheckCXXCompilerFlag)
@@ -123,6 +145,7 @@ add_executable(diamond src/run/main.cpp
   src/run/cluster.cpp
   src/util/algo/greedy_vortex_cover.cpp
   src/util/algo/greedy_vortex_cover_weighted.cpp
+  src/util/sequence/sequence.cpp
 )
 
 if(EXTRA)


=====================================
README.md
=====================================
@@ -32,7 +32,7 @@ quick example for setting up and using the program on Linux.
 Installing the software on your system may be done by downloading it in
 binary format for immediate use:
 
-    wget http://github.com/bbuchfink/diamond/releases/download/v0.9.23/diamond-linux64.tar.gz
+    wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
     tar xzf diamond-linux64.tar.gz
 
 The extracted `diamond` binary file should be moved to a directory


=====================================
build_simple.sh
=====================================
@@ -80,4 +80,5 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
   src/run/cluster.cpp \
   src/util/algo/greedy_vortex_cover.cpp \
   src/util/algo/greedy_vortex_cover_weighted.cpp \
+  src/util/sequence/sequence.cpp \
 -lz -lpthread -o diamond


=====================================
src/ChangeLog
=====================================
@@ -1,3 +1,16 @@
+[0.9.24]
+- Fixed a compiler error on macOS.
+- Added --header option to output header for tabular output format.
+- The quality string output in tabular format (qqual field) is clipped to the aligned part of the query.
+- Print '*' as quality string if quality values are not available in tabular output format.
+- Added field 'full_qqual' to print unclipped query quality values to the tabular format.
+- Added field 'full_qseq' to print unclipped query sequence to the tabular format.
+- Added support for using the hyphen character '-' to denote the standard input for input file parameters.
+- Status messages are written to stderr.
+- Fixed a bug that could incorrectly report queries as unaligned in the output of the --un option.
+- Added option '--al' to write aligned queries to file.
+- Added options '--alfmt' and '--unfmt' to set the format of the aligned/unaligned query file (supported values: fasta, fastq).
+
 [0.9.23]
 - Fixed an issue that could cause too high memory usage.
 - Added output field 'qqual' to print query FASTQ quality values to the tabular format.


=====================================
src/align/align.cpp
=====================================
@@ -105,8 +105,11 @@ void align_worker(size_t thread_id, const Parameters *params, const Metadata *me
 		if (*output_format != Output_format::null) {
 			buf = new TextBuffer;
 			const bool aligned = mapper->generate_output(*buf, stat, *metadata);
-			if (aligned && (!config.unaligned.empty() || !config.aligned_file.empty()))
+			if (aligned && (!config.unaligned.empty() || !config.aligned_file.empty())) {
+				query_aligned_mtx.lock();
 				query_aligned[hits.query] = true;
+				query_aligned_mtx.unlock();
+			}
 		}
 		delete mapper;
 		OutputSink::get().push(hits.query, buf);


=====================================
src/basic/basic.cpp
=====================================
@@ -24,7 +24,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "sequence.h"
 #include "masking.h"
 
-const char* Const::version_string = "0.9.23";
+const char* Const::version_string = "0.9.24";
 const char* Const::program_name = "diamond";
 const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
 


=====================================
src/basic/config.cpp
=====================================
@@ -70,7 +70,8 @@ Config::Config(int argc, const char **argv)
 		.add_command("seed-stat", "")
 		.add_command("smith-waterman", "")
 		.add_command("protein-snps", "")
-		.add_command("cluster", "");
+		.add_command("cluster", "")
+		.add_command("translate", "");
 
 	Options_group general("General options");
 	general.add()
@@ -94,8 +95,9 @@ Config::Config(int argc, const char **argv)
 \tsstart means Start of alignment in subject\n\
 \tsend means End of alignment in subject\n\
 \tqseq means Aligned part of query sequence\n\
+\tfull_qseq means Query sequence\n\
 \tsseq means Aligned part of subject sequence\n\
-\tfull_sseq means Full subject sequence\n\
+\tfull_sseq means Subject sequence\n\
 \tevalue means Expect value\n\
 \tbitscore means Bit score\n\
 \tscore means Raw score\n\
@@ -118,7 +120,8 @@ Config::Config(int argc, const char **argv)
 \n\tDefault: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", output_format)
 ("verbose", 'v', "verbose console output", verbose)
 ("log", 0, "enable debug log", debug_log)
-("quiet", 0, "disable console output", quiet);
+("quiet", 0, "disable console output", quiet)
+("header", 0, "Write header lines to blast tabular format.", output_header);
 
 	Options_group makedb("Makedb options");
 	makedb.add()
@@ -130,6 +133,8 @@ Config::Config(int argc, const char **argv)
 		("strand", 0, "query strands to search (both/minus/plus)", query_strands, string("both"))
 		("un", 0, "file for unaligned queries", unaligned)
 		("al", 0, "file or aligned queries", aligned_file)
+		("unfmt", 0, "format of unaligned query file (fasta/fastq)", unfmt, string("fasta"))
+		("alfmt", 0, "format of aligned query file (fasta/fastq)", alfmt, string("fasta"))
 		("unal", 0, "report unaligned queries (0=no, 1=yes)", report_unaligned, -1)
 		("max-target-seqs", 'k', "maximum number of target sequences to report alignments for", max_alignments, uint64_t(25))
 		("top", 0, "report alignments within this percentage range of top alignment score (overrides --max-target-seqs)", toppercent, 100.0)
@@ -342,7 +347,7 @@ Config::Config(int argc, const char **argv)
 		;
 	}
 
-	invocation = join(' ', vector<string>(&argv[0], &argv[argc]));
+	invocation = join(" ", vector<string>(&argv[0], &argv[argc]));
 	log_stream << invocation << endl;
 
 	if (!no_auto_append) {
@@ -423,6 +428,11 @@ Config::Config(int argc, const char **argv)
 	if (query_strands != "both" && query_strands != "minus" && query_strands != "plus")
 		throw std::runtime_error("Invalid value for parameter --strand");
 
+	if (unfmt == "fastq" || alfmt == "fastq")
+		store_query_quality = true;
+	if (!aligned_file.empty())
+		log_stream << "Aligned file format: " << alfmt << endl;
+
 	/*log_stream << "sizeof(hit)=" << sizeof(hit) << " sizeof(packed_uint40_t)=" << sizeof(packed_uint40_t)
 		<< " sizeof(sorted_list::entry)=" << sizeof(sorted_list::entry) << endl;*/
 


=====================================
src/basic/config.h
=====================================
@@ -168,11 +168,14 @@ struct Config
 	unsigned swipe_chunk_size;
 	unsigned query_parallel_limit;
 	bool long_reads;
+	bool output_header;
+	string alfmt;
+	string unfmt;
 
 	enum {
 		makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
 		match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta = 18, dbinfo = 19, test_extra = 20, test_io = 21, db_annot_stats = 22, read_sim = 23, info = 24, seed_stat = 25,
-		smith_waterman = 26, protein_snps = 27, cluster = 28
+		smith_waterman = 26, protein_snps = 27, cluster = 28, translate = 29
 	};
 	unsigned	command;
 


=====================================
src/basic/const.h
=====================================
@@ -23,7 +23,7 @@ struct Const
 {
 
 	enum {
-		build_version = 124,
+		build_version = 125,
 		seedp_bits = 10,
 		seedp = 1<<seedp_bits,
 		max_seed_weight = 32,


=====================================
src/basic/sequence.h
=====================================
@@ -86,6 +86,9 @@ struct sequence
 	}
 	bool empty() const
 	{ return len_ == 0; }
+	bool present() const {
+		return len_ > 0;
+	}
 	const char* c_str() const
 	{ return reinterpret_cast<const char*>(data_); }
 	size_t print(char *ptr, unsigned begin, unsigned len) const


=====================================
src/data/queries.cpp
=====================================
@@ -17,6 +17,10 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
 #include "queries.h"
+#include "../util/sequence/sequence.h"
+#include "../basic/config.h"
+
+using namespace std;
 
 unsigned current_query_chunk;
 Sequence_set* query_source_seqs::data_ = 0;
@@ -24,6 +28,7 @@ Sequence_set* query_seqs::data_ = 0;
 String_set<0>* query_ids::data_ = 0;
 Partitioned_histogram query_hst;
 vector<bool> query_aligned;
+tthread::mutex query_aligned_mtx;
 Seed_set *query_seeds = 0;
 Hashed_seed_set *query_seeds_hashed = 0;
 String_set<0> *query_qual = nullptr;
@@ -35,11 +40,12 @@ void write_unaligned(OutputFile *file)
 	TextBuffer buf;
 	for (size_t i = 0; i < n; ++i) {
 		if (!query_aligned[i]) {
-			buf << '>' << query_ids::get()[i].c_str() << '\n';
-			(align_mode.query_translated ? query_source_seqs::get()[i] : query_seqs::get()[i]).print(buf, input_value_traits);
-			buf << '\n';
-			file->write(buf.get_begin(), buf.size());
-			buf.clear();
+			Util::Sequence::format(align_mode.query_translated ? query_source_seqs::get()[i] : query_seqs::get()[i],
+				query_ids::get()[i].c_str(),
+				query_qual ? (*query_qual)[i].c_str() : nullptr,
+				*file,
+				config.unfmt,
+				input_value_traits);
 		}
 	}
 }
@@ -50,11 +56,12 @@ void write_aligned(OutputFile *file)
 	TextBuffer buf;
 	for (size_t i = 0; i < n; ++i) {
 		if (query_aligned[i]) {
-			buf << '>' << query_ids::get()[i].c_str() << '\n';
-			(align_mode.query_translated ? query_source_seqs::get()[i] : query_seqs::get()[i]).print(buf, input_value_traits);
-			buf << '\n';
-			file->write(buf.get_begin(), buf.size());
-			buf.clear();
+			Util::Sequence::format(align_mode.query_translated ? query_source_seqs::get()[i] : query_seqs::get()[i],
+				query_ids::get()[i].c_str(),
+				query_qual ? (*query_qual)[i].c_str() : nullptr,
+				*file,
+				config.alfmt,
+				input_value_traits);
 		}
 	}
 }
\ No newline at end of file


=====================================
src/data/queries.h
=====================================
@@ -25,6 +25,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../basic/statistics.h"
 #include "sequence_set.h"
 #include "seed_set.h"
+#include "../util/tinythread.h"
 
 extern Partitioned_histogram query_hst;
 extern unsigned current_query_chunk;
@@ -50,6 +51,7 @@ struct query_ids
 	static String_set<0> *data_;
 };
 
+extern tthread::mutex query_aligned_mtx;
 extern vector<bool> query_aligned;
 extern String_set<0> *query_qual;
 


=====================================
src/data/reference.cpp
=====================================
@@ -401,12 +401,19 @@ void db_info()
 	db_file.close();
 }
 
-DatabaseFile* DatabaseFile::auto_create_from_fasta() {
-	InputFile db_file(config.database);
+bool DatabaseFile::is_diamond_db(const string &file_name) {
+	if (file_name == "-")
+		return false;
+	InputFile db_file(file_name);
 	uint64_t magic_number;
-	if (db_file.read(&magic_number, 1) != 1 || magic_number != ReferenceHeader::MAGIC_NUMBER) {
+	bool r = db_file.read(&magic_number, 1) == 1 && magic_number == ReferenceHeader::MAGIC_NUMBER;
+	db_file.close();
+	return r;
+}
+
+DatabaseFile* DatabaseFile::auto_create_from_fasta() {
+	if (!is_diamond_db(config.database)) {
 		message_stream << "Database file is not a DIAMOND database, treating as FASTA." << endl;
-		db_file.close();
 		config.input_ref_file = config.database;
 		TempFile *db;
 		make_db(&db);
@@ -414,8 +421,6 @@ DatabaseFile* DatabaseFile::auto_create_from_fasta() {
 		delete db;
 		return r;
 	}
-	else {
-		db_file.close();
+	else
 		return new DatabaseFile(config.database);
-	}
 }
\ No newline at end of file


=====================================
src/data/reference.h
=====================================
@@ -22,6 +22,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include <vector>
 #include <string>
 #include <string.h>
+#include <stdint.h>
 #include "../util/io/serializer.h"
 #include "../util/io/input_file.h"
 #include "../data/seed_histogram.h"
@@ -43,8 +44,8 @@ struct ReferenceHeader
 	uint64_t magic_number;
 	uint32_t build, db_version;
 	uint64_t sequences, letters, pos_array_offset;
-	static const uint64_t MAGIC_NUMBER = 0x24af8a415ee186dllu;
 	enum { current_db_version = 2 };
+	static constexpr uint64_t MAGIC_NUMBER = 0x24af8a415ee186dllu;
 };
 
 struct ReferenceHeader2
@@ -76,6 +77,7 @@ struct DatabaseFile : public InputFile
 	DatabaseFile(TempFile &tmp_file);
 	static void read_header(InputFile &stream, ReferenceHeader &header);
 	static DatabaseFile* auto_create_from_fasta();
+	static bool is_diamond_db(const string &file_name);
 	void rewind();
 	bool load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, bool masked, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids = true, const vector<bool> *filter = NULL);
 	void get_seq();


=====================================
src/dp/score_vector.h
=====================================
@@ -325,7 +325,10 @@ struct score_vector<int16_t>
 
 template<typename _t>
 struct ScoreTraits
-{};
+{
+	typedef void Score;
+	enum { CHANNELS = 0 };
+};
 
 template<>
 struct ScoreTraits<int32_t>


=====================================
src/output/blast_tab_format.cpp
=====================================
@@ -16,12 +16,16 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
+#include <sstream>
 #include <set>
+#include <numeric>
 #include "../basic/match.h"
 #include "output_format.h"
 #include "../data/taxonomy.h"
 #include "../data/queries.h"
 
+using namespace std;
+
 const char* Blast_tab_format::field_str[] = {
 	"qseqid",		// 0 means Query Seq - id
 	"qgi",			// 1 means Query GI
@@ -75,7 +79,67 @@ const char* Blast_tab_format::field_str[] = {
 	"qqual",		// 49
 	"qnum",			// 50
 	"snum",			// 51
-	"scovhsp"		// 52
+	"scovhsp",		// 52
+	"full_qqual",	// 53
+	"full_qseq"		// 54
+};
+
+const char* Blast_tab_format::field_desc[] = {
+	"Query ID",		// 0 means Query Seq - id
+	"qgi",			// 1 means Query GI
+	"qacc",			// 2 means Query accesion
+	"qaccver",		// 3 means Query accesion.version
+	"Query length",	// 4 means Query sequence length
+	"Subject ID",		// 5 means Subject Seq - id
+	"Subject IDs",	// 6 means All subject Seq - id(s), separated by a ';'
+	"sgi",			// 7 means Subject GI
+	"sallgi",		// 8 means All subject GIs
+	"sacc",			// 9 means Subject accession
+	"saccver",		// 10 means Subject accession.version
+	"sallacc",		// 11 means All subject accessions
+	"Subject length",			// 12 means Subject sequence length
+	"Start of alignment in query",		// 13 means Start of alignment in query
+	"End of alignment in query",			// 14 means End of alignment in query
+	"Start of alignment in subject",		// 15 means Start of alignment in subject
+	"End of alignment in subject",			// 16 means End of alignment in subject
+	"Aligned part of query sequence",			// 17 means Aligned part of query sequence
+	"Aligned part of subject sequence",			// 18 means Aligned part of subject sequence
+	"Expected value",		// 19 means Expect value
+	"Bit score",		// 20 means Bit score
+	"Raw score",		// 21 means Raw score
+	"Alignment length",		// 22 means Alignment length
+	"Percentage of identical matches",		// 23 means Percentage of identical matches
+	"Number of identical matches",		// 24 means Number of identical matches
+	"Number of mismatches",		// 25 means Number of mismatches
+	"Number of positive-scoring matches",		// 26 means Number of positive - scoring matches
+	"Number of gap openings",		// 27 means Number of gap openings
+	"Total number of gaps",			// 28 means Total number of gaps
+	"Percentage of positive-scoring matches",			// 29 means Percentage of positive - scoring matches
+	"frames",		// 30 means Query and subject frames separated by a '/'
+	"Query frame",		// 31 means Query frame
+	"sframe",		// 32 means Subject frame
+	"Blast traceback operations",			// 33 means Blast traceback operations(BTOP)
+	"Subject Taxonomy IDs",		// 34 means unique Subject Taxonomy ID(s), separated by a ';'	(in numerical order)
+	"sscinames",	// 35 means unique Subject Scientific Name(s), separated by a ';'
+	"scomnames",	// 36 means unique Subject Common Name(s), separated by a ';'
+	"sblastnames",	// 37 means unique Subject Blast Name(s), separated by a ';'	(in alphabetical order)
+	"sskingdoms",	// 38 means unique Subject Super Kingdom(s), separated by a ';'	(in alphabetical order)
+	"Subject title",		// 39 means Subject Title
+	"Subject titles",	// 40 means All Subject Title(s), separated by a '<>'
+	"sstrand",		// 41 means Subject Strand
+	"qcovs",		// 42 means Query Coverage Per Subject
+	"Query coverage per HSP",		// 43 means Query Coverage Per HSP
+	"qcovus",		// 44 means Query Coverage Per Unique Subject(blastn only)
+	"Query title",		// 45 means Query title
+	"swdiff",		// 46
+	"time", 		// 47
+	"Subject sequence",	// 48
+	"Aligned part of query quality values",		// 49
+	"qnum",			// 50
+	"snum",			// 51
+	"scovhsp",		// 52
+	"Query quality values",	// 53
+	"Query sequence"		// 54
 };
 
 Blast_tab_format::Blast_tab_format() :
@@ -98,7 +162,7 @@ Blast_tab_format::Blast_tab_format() :
 			config.salltitles = true;
 		if (j == 48)
 			config.use_lazy_dict = true;
-		if (j == 49)
+		if (j == 49 || j == 53)
 			config.store_query_quality = true;
 	}
 }
@@ -250,7 +314,7 @@ void Blast_tab_format::print_match(const Hsp_context& r, const Metadata &metadat
 			out << r.subject_seq;
 			break;
 		case 49:
-			out << (*query_qual)[r.query_id].c_str();
+			out << (query_qual && (*query_qual)[r.query_id].present() ? (*query_qual)[r.query_id].subseq(r.query_source_range().begin_, r.query_source_range().end_).c_str() : "*");
 			break;
 		case 50:
 			out << query_block_to_database_id[r.query_id];
@@ -261,6 +325,12 @@ void Blast_tab_format::print_match(const Hsp_context& r, const Metadata &metadat
 		case 52:
 			out << (double)r.subject_range().length() * 100.0 / r.subject_len;
 			break;
+		case 53:
+			out << (query_qual && (*query_qual)[r.query_id].present() ? (*query_qual)[r.query_id].c_str() : "*");
+			break;
+		case 54:
+			r.query.source().print(out, input_value_traits);
+			break;
 		default:
 			throw std::runtime_error(string("Invalid output field: ") + field_str[*i]);
 		}
@@ -289,6 +359,7 @@ void Blast_tab_format::print_query_intro(size_t query_num, const char *query_nam
 			case 39:
 			case 40:
 			case 48:
+			case 49:
 				out << '*';
 				break;
 			case 12:
@@ -317,8 +388,11 @@ void Blast_tab_format::print_query_intro(size_t query_num, const char *query_nam
 			case 45:
 				out << query_name;
 				break;
-			case 49:
-				out << (*query_qual)[query_num].c_str();
+			case 53:
+				out << (query_qual && (*query_qual)[query_num].present() ? (*query_qual)[query_num].c_str() : "*");
+				break;
+			case 54:
+				(align_mode.query_translated ? query_source_seqs::get()[query_num] : query_seqs::get()[query_num]).print(out, input_value_traits);
 				break;
 			default:
 				throw std::runtime_error(string("Invalid output field: ") + field_str[*i]);
@@ -328,4 +402,15 @@ void Blast_tab_format::print_query_intro(size_t query_num, const char *query_nam
 		}
 		out << '\n';
 	}
+}
+
+void Blast_tab_format::print_header(Consumer &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const {
+	if (!config.output_header)
+		return;
+	stringstream ss;
+	ss << "# DIAMOND v" << Const::version_string << ". http://github.com/bbuchfink/diamond" << endl;
+	ss << "# Invocation: " << config.invocation << endl;
+	ss << "# Fields: " << join(", ", apply(fields, [](unsigned i) -> string { return string(field_desc[i]); })) << endl;
+	const string s(ss.str());
+	f.consume(s.data(), s.length());
 }
\ No newline at end of file


=====================================
src/output/output_format.h
=====================================
@@ -87,13 +87,14 @@ struct DAA_format : public Output_format
 
 struct Blast_tab_format : public Output_format
 {
-	static const char* field_str[];
+	static const char* field_str[], *field_desc[];
 	Blast_tab_format();
-	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const;
-	virtual void print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out);
+	virtual void print_header(Consumer &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const override;
+	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const override;
+	virtual void print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) override;
 	virtual ~Blast_tab_format()
 	{ }
-	virtual Output_format* clone() const
+	virtual Output_format* clone() const override
 	{
 		return new Blast_tab_format(*this);
 	}


=====================================
src/output/target_culling.h
=====================================
@@ -30,6 +30,7 @@ struct TargetCulling
 	virtual int cull(const vector<IntermediateRecord> &target_hsp) const = 0;
 	virtual void add(const Target &t) = 0;
 	virtual void add(const vector<IntermediateRecord> &target_hsp) = 0;
+	virtual ~TargetCulling() = default;
 	enum { FINISHED = 0, NEXT = 1, INCLUDE = 2};
 	static TargetCulling* get();
 };
@@ -70,6 +71,7 @@ struct GlobalCulling : public TargetCulling
 			top_score_ = target_hsp[0].score;
 		++n_;
 	}
+	virtual ~GlobalCulling() = default;
 private:
 	size_t n_;
 	int top_score_;
@@ -119,6 +121,7 @@ struct RangeCulling : public TargetCulling
 		for (std::vector<IntermediateRecord>::const_iterator i = target_hsp.begin(); i != target_hsp.end(); ++i)
 			p_.insert(i->absolute_query_range(), i->score);
 	}
+	virtual ~RangeCulling() = default;
 private:
 	IntervalPartition p_;
 };


=====================================
src/run/main.cpp
=====================================
@@ -42,11 +42,11 @@ void info();
 void seed_stat();
 void pairwise();
 void protein_snps();
+void translate();
 
 int main(int ac, const char* av[])
 {
 	try {
-
 		check_simd();
 		config = Config(ac, av);
 
@@ -126,6 +126,9 @@ int main(int ac, const char* av[])
 		case Config::cluster:
 			Workflow::Cluster::run();
 			break;
+		case Config::translate:
+			translate();
+			break;
 #ifdef EXTRA
 		case Config::test_extra:
 			test_main();


=====================================
src/run/tools.cpp
=====================================
@@ -360,3 +360,16 @@ void protein_snps()
 	}
 	call_protein_snps(current_gene, ref_genes[current_gene], snps, dataset);
 }
+
+void translate() {
+	input_value_traits = nucleotide_traits;
+	TextInputFile in(config.query_file);
+	vector<char> id, seq;
+	vector<char> proteins[6];
+	while (FASTA_format().get_seq(id, seq, in)) {
+		Translator::translate(seq, proteins);
+		cout << ">" << string(id.data(), id.size()) << endl;
+		cout << sequence(proteins[0]) << endl;
+	}
+	in.close();
+}
\ No newline at end of file


=====================================
src/util/command_line_parser.cpp
=====================================
@@ -18,6 +18,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 #include <iostream>
 #include <algorithm>
+#include <string.h>
 #include "command_line_parser.h"
 
 using namespace std;
@@ -87,7 +88,7 @@ void Command_line_parser::store(int count, const char ** str, unsigned &command)
 
 	vector<string> v;
 	for (int i = 2; i < count; ++i) {
-		if (str[i][0] == '-') {
+		if (str[i][0] == '-' && strlen(str[i]) > 1) {
 			store_option(v);
 			v.clear();
 		}


=====================================
src/util/io/file_source.cpp
=====================================
@@ -38,9 +38,9 @@ FileSource::FileSource(const string &file_name) :
 	file_name_(file_name)
 {
 #ifdef _MSC_VER
-	f_ = file_name.empty() ? stdin : fopen(file_name.c_str(), "rb");
+	f_ = (file_name.empty() || file_name == "-") ? stdin : fopen(file_name.c_str(), "rb");
 #else
-	int fd_ = file_name.empty() ? 0 : POSIX_OPEN2(file_name.c_str(), O_RDONLY);
+	int fd_ = (file_name.empty() || file_name == "-") ? 0 : POSIX_OPEN2(file_name.c_str(), O_RDONLY);
 	if (fd_ < 0) {
 		perror(0);
 		throw std::runtime_error(string("Error opening file ") + file_name);


=====================================
src/util/io/input_file.cpp
=====================================
@@ -41,7 +41,7 @@ InputFile::InputFile(const string &file_name, int flags) :
 	Deserializer(new InputStreamBuffer(new FileSource(file_name))),
 	file_name(file_name)
 {
-	if (file_name.empty())
+	if (file_name.empty() || file_name == "-")
 		return;
 #ifndef _MSC_VER
 	struct stat buf;


=====================================
src/util/log_stream.h
=====================================
@@ -37,7 +37,7 @@ struct Message_stream
 	Message_stream& operator<<(const _t& x)
 	{
 		if(to_cout_)
-			std::cout << x;
+			std::cerr << x;
 		if (to_file_) {
 			std::ofstream f("diamond.log", std::ios_base::out | std::ios_base::app);
 			f << x;
@@ -49,7 +49,7 @@ struct Message_stream
 	Message_stream& operator<<(std::ostream & (*_Pfn)(std::ostream&))
 	{
 		if(to_cout_)
-			((*_Pfn)(std::cout));
+			((*_Pfn)(std::cerr));
 		if (to_file_) {
 			mtx.lock();
 			std::ofstream f("diamond.log", std::ios_base::out | std::ios_base::app);


=====================================
src/util/sequence/sequence.cpp
=====================================
@@ -0,0 +1,50 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <algorithm>
+#include <stdexcept>
+#include "sequence.h"
+
+using namespace std;
+
+namespace Util { namespace Sequence {
+
+void format(sequence seq, const char *id, const char *qual, OutputFile &out, const std::string &format, const Value_traits &value_traits) {
+	static TextBuffer buf;
+	constexpr size_t WRAP = 160;
+	const size_t l = seq.length();
+	if(format == "fasta") {
+		buf << '>' << id << '\n';
+		for (size_t i = 0; i < seq.length(); i += WRAP) {
+			seq.print(buf, i, min(i + WRAP, seq.length()), value_traits);
+			buf << '\n';
+		}
+	}
+	else if (format == "fastq") {
+		buf << '@' << id << '\n';
+		seq.print(buf, value_traits);
+		buf << "\n+\n";
+		buf << qual << '\n';
+	}
+	else
+		throw runtime_error("Invalid sequence file format");
+	out.write(buf.get_begin(), buf.size());
+	buf.clear();
+}
+
+}}
\ No newline at end of file


=====================================
src/util/sequence/sequence.h
=====================================
@@ -0,0 +1,32 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink at gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#ifndef UTIL_SEQUENCE_SEQUENCE_H_
+#define UTIL_SEQUENCE_SEQUENCE_H_
+
+#include <string>
+#include "../../basic/sequence.h"
+#include "../io/output_file.h"
+
+namespace Util { namespace Sequence {
+
+void format(sequence seq, const char *id, const char *qual, OutputFile &out, const std::string &format, const Value_traits &value_traits);
+
+}}
+
+#endif
\ No newline at end of file


=====================================
src/util/system/system.cpp
=====================================
@@ -10,18 +10,16 @@ using namespace std;
 #else
 #include <unistd.h>
 #include <sys/stat.h>
-#include <linux/limits.h>
 #endif
 
 string executable_path() {
-#ifdef _MSC_VER
 	char buf[4096];
+#ifdef _MSC_VER
 	if (GetModuleFileNameA(NULL, buf, sizeof(buf)) == 0)
 		throw runtime_error("Error executing GetModuleFileNameA.");
 	return string(buf);
 #else
-	char buf[PATH_MAX + 1];
-	if (readlink("/proc/self/exe", buf, PATH_MAX) < 0)
+	if (readlink("/proc/self/exe", buf, sizeof(buf)) < 0)
 		throw runtime_error("Error executing readlink on /proc/self/exe.");
 	return string(buf);
 #endif


=====================================
src/util/util.cpp
=====================================
@@ -70,7 +70,7 @@ const EscapeSequence EscapeSequences::xml_data[5] = {
 
 const EscapeSequences EscapeSequences::XML(EscapeSequences::xml_data, 5);
 
-string join(char c, const vector<string> &v) {
+string join(const char *c, const vector<string> &v) {
 	string s;
 	if (v.empty())
 		return s;


=====================================
src/util/util.h
=====================================
@@ -526,6 +526,15 @@ inline set<unsigned> parse_csv(const string &s)
 	return r;
 }
 
-std::string join(char c, const std::vector<std::string> &v);
+std::string join(const char *c, const std::vector<std::string> &v);
+
+template<typename _t, typename _f>
+auto apply(const std::vector<_t> &v, _f f) -> std::vector<typename std::result_of<_f(_t)>::type> {
+	std::vector<typename std::result_of<_f(_t)>::type> r;
+	r.reserve(v.size());
+	for (const auto &i : v)
+		r.push_back(f(i));
+	return r;
+}
 
 #endif /* UTIL_H_ */



View it on GitLab: https://salsa.debian.org/med-team/diamond-aligner/commit/0238729ea8a318b983987b3e4ad3b030db5d4d8a

-- 
View it on GitLab: https://salsa.debian.org/med-team/diamond-aligner/commit/0238729ea8a318b983987b3e4ad3b030db5d4d8a
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20181229/e60b3793/attachment-0001.html>