[med-svn] [Git][med-team/libmaus2][upstream] New upstream version 2.0.810+ds

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Thu Feb 24 20:12:12 GMT 2022



Étienne Mollier pushed to branch upstream at Debian Med / libmaus2


Commits:
f487c237 by Étienne Mollier at 2022-02-24T20:33:22+01:00
New upstream version 2.0.810+ds
- - - - -


10 changed files:

- ChangeLog
- configure.ac
- src/libmaus2/suffixsort/BwtMergeBlockSortResult.hpp
- src/libmaus2/suffixsort/BwtMergeTempFileNameSet.hpp
- src/libmaus2/suffixsort/bwtb3m/BwtMergeSortTemplate.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeStrategyBaseBlock.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeStrategyBlock.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeStrategyMergeBlock.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeStrategyMergeGapRequest.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeTree.hpp


Changes:

=====================================
ChangeLog
=====================================
@@ -1,3 +1,9 @@
+libmaus2 (2.0.810-1) unstable; urgency=medium
+
+  *  Apply some improvements in BWT construction
+
+ -- German Tischler-Höhle <germant at miltenyibiotec.de>  Wed, 16 Feb 2022 14:51:33 +0100
+
 libmaus2 (2.0.809-1) unstable; urgency=medium
 
   * Improve cleanup in BwtMergeSortTemplate


=====================================
configure.ac
=====================================
@@ -1,5 +1,5 @@
-AC_INIT(libmaus2,2.0.809,[germant at miltenyibiotec.de],[libmaus2],[https://gitlab.com/german.tischler/libmaus2])
-LIBRARY_VERSION=2:809:0
+AC_INIT(libmaus2,2.0.810,[germant at miltenyibiotec.de],[libmaus2],[https://gitlab.com/german.tischler/libmaus2])
+LIBRARY_VERSION=2:810:0
 AC_MSG_NOTICE([Configuring for source in directory ${srcdir}])
 AC_CANONICAL_SYSTEM
 AC_CANONICAL_HOST


=====================================
src/libmaus2/suffixsort/BwtMergeBlockSortResult.hpp
=====================================
@@ -99,6 +99,10 @@ namespace libmaus2
 			void removeFilesButBwtAndGt() const { files.removeFilesButBwtAndGt(); }
 			void setTempPrefixAndRegisterAsTemp(libmaus2::util::TempFileNameGenerator & gtmpgen, uint64_t const numbwt, uint64_t const numgt, uint64_t const numisa)
 			{ files.setPrefixAndRegisterAsTemp(gtmpgen,numbwt,numgt,numisa); }
+			void setTempPrefixSingleAndRegisterAsTemp(libmaus2::util::TempFileNameGenerator & gtmpgen)
+			{ files.setPrefixSingleAndRegisterAsTemp(gtmpgen); }
+			void setTempPrefixSingle(std::string const & dir_name)
+			{ files.setTempPrefixSingle(dir_name); }
 			libmaus2::autoarray::AutoArray < ::libmaus2::suffixsort::BwtMergeZBlock > const & getZBlocks() const { return zblocks; }
 			void resizeZBlocks(uint64_t const n) { zblocks.resize(n); }
 			void setZBlock(uint64_t const i, ::libmaus2::suffixsort::BwtMergeZBlock const & z) { zblocks.at(i) = z; }


=====================================
src/libmaus2/suffixsort/BwtMergeTempFileNameSet.hpp
=====================================
@@ -165,10 +165,58 @@ namespace libmaus2
 			void setSampledISA(std::string const & rsampledisa) { sampledisa = std::vector<std::string>(1,rsampledisa); }
 			void setSampledISA(std::vector<std::string> const & rsampledisa) { sampledisa = rsampledisa; }
 
+			void setTempPrefixSingle(std::string const & dir_name) {
+				std::filesystem::path const p_dir_name(dir_name);
+
+				std::filesystem::path const p_hwt_req(p_dir_name / "block.hwtreq");
+				std::filesystem::path const p_hwt(p_dir_name / "block.hwt");
+				std::filesystem::path const p_hist(p_dir_name / "block.hist");
+				std::filesystem::path const p_hist_freq(p_dir_name / "block.histfreq");
+
+				setHWTReq  (p_hwt_req.string());
+				setHWT     (p_hwt.string());
+				setHist    (p_hist.string());
+				setHistFreq(p_hist_freq.string());
+			}
+
+			void setPrefixSingle(libmaus2::util::TempFileNameGenerator & gtmpgen)
+			{
+				/**
+				 * number of files: 4
+				 **/
+				std::string const thwtreq = gtmpgen.getFileName()+".hwtreq";
+				setHWTReq(thwtreq);
+				std::string const thwt = gtmpgen.getFileName()+".hwt";
+				setHWT(thwt);
+				std::string const thist = gtmpgen.getFileName()+".hist";
+				setHist(thist);
+				std::string const thistfreq = gtmpgen.getFileName()+".histfreq";
+				setHistFreq(thistfreq);
+			}
+
+			void registerAsTempSingle()
+			{
+				::libmaus2::util::TempFileRemovalContainer::addTempFile(getHWT());
+				::libmaus2::util::TempFileRemovalContainer::addTempFile(getHWTReq());
+				::libmaus2::util::TempFileRemovalContainer::addTempFile(getHist());
+				::libmaus2::util::TempFileRemovalContainer::addTempFile(getHistFreq());
+			}
+
+			void setPrefixSingleAndRegisterAsTemp(libmaus2::util::TempFileNameGenerator & gtmpgen)
+			{
+				setPrefixSingle(gtmpgen);
+				registerAsTempSingle();
+			}
+
 			void setPrefix(libmaus2::util::TempFileNameGenerator & gtmpgen, uint64_t const numbwt, uint64_t const numgt, uint64_t const numisa)
 			{
+				/**
+				 * number of files: 4 + numgt + numbwt + numisa
+				 **/
+				setPrefixSingle(gtmpgen);
+
 				std::vector<std::string> gtfilenames(numgt);
-				for ( uint64_t i = 0; i < numbwt; ++i )
+				for ( uint64_t i = 0; i < numgt; ++i )
 				{
 					std::ostringstream ostr;
 					ostr << gtmpgen.getFileName() << '_'
@@ -187,16 +235,7 @@ namespace libmaus2
 						<< ".bwt";
 					bwtfilenames[i] = ostr.str();
 				}
-
 				setBWT(bwtfilenames);
-				std::string const thwtreq = gtmpgen.getFileName()+".hwtreq";
-				setHWTReq(thwtreq);
-				std::string const thwt = gtmpgen.getFileName()+".hwt";
-				setHWT(thwt);
-				std::string const thist = gtmpgen.getFileName()+".hist";
-				setHist(thist);
-				std::string const thistfreq = gtmpgen.getFileName()+".histfreq";
-				setHist(thistfreq);
 
 				std::vector<std::string> isafilenames(numisa);
 				for ( uint64_t i = 0; i < numisa; ++i )
@@ -210,15 +249,18 @@ namespace libmaus2
 				setSampledISA(isafilenames);
 			}
 
+
 			void setPrefixAndRegisterAsTemp(libmaus2::util::TempFileNameGenerator & gtmpgen, uint64_t const numbwt, uint64_t const numgt, uint64_t const numisa)
 			{
+
 				setPrefix(gtmpgen, numbwt, numgt, numisa);
+
+				registerAsTempSingle();
+
 				for ( uint64_t i = 0; i < getGT().size(); ++i )
 					::libmaus2::util::TempFileRemovalContainer::addTempFile(getGT()[i]);
 				for ( uint64_t i = 0; i < getBWT().size(); ++i )
 					::libmaus2::util::TempFileRemovalContainer::addTempFile(getBWT()[i]);
-				::libmaus2::util::TempFileRemovalContainer::addTempFile(getHWT());
-				::libmaus2::util::TempFileRemovalContainer::addTempFile(getHist());
 				for ( uint64_t i = 0; i < getSampledISAVector().size(); ++i )
 					::libmaus2::util::TempFileRemovalContainer::addTempFile(getSampledISAVector()[i]);
 			}
@@ -311,7 +353,7 @@ namespace libmaus2
 				uint64_t const numgtfiles
 			)
 			{
-				return numgtfiles + numbwtfiles + 1 + 1 + 1 + 1;
+				return numgtfiles + numbwtfiles + 1 /* hwtreq */ + 1 /* hwt */ + 1 /* hist */ + 1 /* hist.freq */ + 1 /* sample isa */;
 
 			}
 


=====================================
src/libmaus2/suffixsort/bwtb3m/BwtMergeSortTemplate.hpp
=====================================
@@ -1,6 +1,6 @@
 /**
     libmaus2
-    Copyright (C) 2009-2021 German Tischler-Höhle
+    Copyright (C) 2009-2022 German Tischler-Höhle
     Copyright (C) 2011-2014 Genome Research Limited
 
     This program is free software: you can redistribute it and/or modify
@@ -108,13 +108,19 @@ namespace libmaus2
 				// array of computed LCP values between block and start of next block
 				std::shared_ptr< libmaus2::util::AtomicArray<uint64_t> > V_boundedlcpblockvalues;
 
+				// base tmp directory name
 				std::string tmpdirname;
+				// tmpdirname as path object
 				std::filesystem::path tmppath;
-				std::filesystem::path ds_tmp_path;
 				std::filesystem::path base_tmp_path;
 				std::filesystem::path ds_tmp_path_base_ds_tmp;
+				std::filesystem::path merge_tmp_path;
+				std::filesystem::path ds_tmp_path_merge_ds_tmp;
 
 				std::unique_ptr<libmaus2::util::DirectoryStructure> DSbase;
+				std::unique_ptr<libmaus2::util::DirectoryStructure> DSmerge;
+				std::map < MergeStrategyBlock *, std::string > M_merge_dirs;
+				std::map < MergeStrategyBlock *, std::size_t> M_merge_gt_expected;
 
 				std::unique_ptr< ::libmaus2::suffixsort::BwtMergeTempFileNameSetVector > blocktmpnames;
 
@@ -125,6 +131,8 @@ namespace libmaus2
 
 				libmaus2::huffman::HuffmanTree::unique_ptr_type uhnode;
 
+				std::shared_ptr<libmaus2::suffixsort::bwtb3m::MergeTree> merge_tree;
+
 				static void serialiseNumber(std::ostream & ostr, uint64_t const n) {
 					libmaus2::util::NumberSerialisation::serialiseNumber(ostr,n);
 				}
@@ -154,14 +162,23 @@ namespace libmaus2
 
 					libmaus2::util::StringSerialisation::serialiseString(ostr,tmpdirname);
 					libmaus2::util::StringSerialisation::serialiseString(ostr,tmppath.string());
-					libmaus2::util::StringSerialisation::serialiseString(ostr,ds_tmp_path.string());
+
 					libmaus2::util::StringSerialisation::serialiseString(ostr,base_tmp_path.string());
 					libmaus2::util::StringSerialisation::serialiseString(ostr,ds_tmp_path_base_ds_tmp.string());
 
+					libmaus2::util::StringSerialisation::serialiseString(ostr,merge_tmp_path.string());
+					libmaus2::util::StringSerialisation::serialiseString(ostr,ds_tmp_path_merge_ds_tmp.string());
+
 					libmaus2::util::NumberMapSerialisation::serialiseMap<std::ostream,int64_t,uint64_t>(ostr,chistnoterm);
 					libmaus2::util::NumberMapSerialisation::serialiseMap<std::ostream,int64_t,uint64_t>(ostr,chist);
 					libmaus2::util::NumberSerialisation::serialiseSignedNumber(ostr,bwtterm);
 					libmaus2::util::NumberSerialisation::serialiseNumber(ostr,maxsym);
+
+					if ( merge_tree )
+					{
+						libmaus2::util::NumberSerialisation::serialiseNumber(ostr,1);
+						merge_tree->serialise(ostr);
+					}
 				}
 
 				void deserialise(std::istream & istr)
@@ -191,10 +208,13 @@ namespace libmaus2
 
 					tmpdirname = libmaus2::util::StringSerialisation::deserialiseString(istr);
 					tmppath = libmaus2::util::StringSerialisation::deserialiseString(istr);
-					ds_tmp_path = libmaus2::util::StringSerialisation::deserialiseString(istr);
+
 					base_tmp_path = libmaus2::util::StringSerialisation::deserialiseString(istr);
 					ds_tmp_path_base_ds_tmp = libmaus2::util::StringSerialisation::deserialiseString(istr);
 
+					merge_tmp_path = libmaus2::util::StringSerialisation::deserialiseString(istr);
+					ds_tmp_path_merge_ds_tmp = libmaus2::util::StringSerialisation::deserialiseString(istr);
+
 					std::unique_ptr<libmaus2::util::DirectoryStructure> t_DSbase(
 						BwtMergeTempFileNameSetVector::getDirectoryStructure(
 							ds_tmp_path_base_ds_tmp.string(),
@@ -221,6 +241,18 @@ namespace libmaus2
 
 					if ( chist.size() )
 						computeHuffmanTree();
+
+					bool const have_merge_tree = libmaus2::util::NumberSerialisation::deserialiseNumber(istr) != 0;
+
+					if ( have_merge_tree )
+					{
+						std::shared_ptr<libmaus2::suffixsort::bwtb3m::MergeTree> t_merge_tree(
+							new libmaus2::suffixsort::bwtb3m::MergeTree(istr)
+						);
+						merge_tree = t_merge_tree;
+
+						setupMergeDirectoryStructure();
+					}
 				}
 
 				static uint64_t getFileSize(
@@ -325,11 +357,14 @@ namespace libmaus2
 					blocksizeprevtwo((blocksize == blocksizenexttwo) ? blocksize : (blocksizenexttwo / 2)),
 					preisasamplingrate(std::min(options.maxpreisasamplingrate,blocksizeprevtwo)),
 					V_boundedlcpblockvalues(new libmaus2::util::AtomicArray<uint64_t>(numblocks,0)),
+					// tmp directory name
 					tmpdirname(ensureDirectory(options.tmpfilenamebase + "_tmpdir")),
+					// path object for tmp directory name
 					tmppath(tmpdirname),
-					ds_tmp_path(ensureDirectory(tmppath / "ds_tmp")),
 					base_tmp_path(ensureDirectory(tmppath / "base_tmp")),
 					ds_tmp_path_base_ds_tmp(tmppath / "base_ds.tmp"),
+					merge_tmp_path(ensureDirectory(tmppath / "merge_tmp")),
+					ds_tmp_path_merge_ds_tmp(tmppath / "merge_ds.tmp"),
 					DSbase(
 						BwtMergeTempFileNameSetVector::getDirectoryStructure(
 							ds_tmp_path_base_ds_tmp.string(),
@@ -340,6 +375,7 @@ namespace libmaus2
 							options.numthreads /* gt */
 						)
 					),
+					DSmerge(),
 					blocktmpnames(
 						new ::libmaus2::suffixsort::BwtMergeTempFileNameSetVector(*DSbase, getNumBlocks(), options.numthreads /* bwt */, options.numthreads /* gt */)
 					),
@@ -356,12 +392,6 @@ namespace libmaus2
 					// there should be at least one block as input size is not zero
 					assert ( numblocks );
 
-					#if 0
-					libmaus2::aio::OutputStreamFactoryContainer::mkdirp(tmpdirname,0700);
-					libmaus2::aio::OutputStreamFactoryContainer::mkdirp(ds_tmp_path.string(),0700);
-					libmaus2::aio::OutputStreamFactoryContainer::mkdirp(base_tmp_path.string(),0700);
-					#endif
-
 					DSbase->doGenerate();
 
 					// std::cerr << "blocktmpnames=\n" << blocktmpnames->toString();
@@ -376,9 +406,17 @@ namespace libmaus2
 
 				void cleanup()
 				{
+					if ( DSmerge ) {
+						for ( auto const & P : M_merge_dirs )
+							libmaus2::aio::OutputStreamFactoryContainer::rmdir(P.second);
+						DSmerge->doRemove();
+					}
+					else
+						libmaus2::aio::OutputStreamFactoryContainer::rmdir(merge_tmp_path);
+
 					DSbase->doRemove();
 					libmaus2::aio::FileRemoval::removeFile(ds_tmp_path_base_ds_tmp.string());
-					libmaus2::aio::OutputStreamFactoryContainer::rmdir(ds_tmp_path.string());
+					libmaus2::aio::FileRemoval::removeFile(ds_tmp_path_merge_ds_tmp.string());
 					libmaus2::aio::OutputStreamFactoryContainer::rmdir(tmpdirname);
 				}
 
@@ -530,6 +568,99 @@ namespace libmaus2
 					huftreeCOS->flush();
 					huftreeCOS.reset();
 				}
+
+				std::size_t getMergeGTExpected(MergeStrategyBlock * p) const
+				{
+					auto it = M_merge_gt_expected.find(p);
+
+					if ( it == M_merge_gt_expected.end() ) {
+						libmaus2::exception::LibMausException lme;
+						lme.getStream() << "BwtMergeState::getMergeGTExpected: unable to find node" << std::endl;
+						lme.finish();
+						throw lme;
+					}
+
+					return it->second;
+				}
+
+				std::string getDirectoryForMerge(MergeStrategyBlock * p) const
+				{
+					auto it = M_merge_dirs.find(p);
+
+					if ( it == M_merge_dirs.end() ) {
+						libmaus2::exception::LibMausException lme;
+						lme.getStream() << "BwtMergeState::getDirectoryForMerge: unable to find node" << std::endl;
+						lme.finish();
+						throw lme;
+					}
+
+					return it->second;
+				}
+
+				void fillGTExpected()
+				{
+					std::vector < MergeStrategyMergeInternalBlock * > V_internal;
+					std::vector < MergeStrategyMergeInternalSmallBlock * > V_internal_small;
+					std::vector < MergeStrategyMergeExternalBlock * > V_external;
+
+					merge_tree->getMergeBlocks(V_internal,V_internal_small,V_external);
+
+					for ( auto * p : V_internal )
+						M_merge_gt_expected[p] = p->getNumGtTempFilesRequired();
+					for ( auto * p : V_internal_small )
+						M_merge_gt_expected[p] = p->getNumGtTempFilesRequired();
+					for ( auto * p : V_external )
+						M_merge_gt_expected[p] = p->getNumGtTempFilesRequired();
+				}
+
+				void setupMergeDirectoryStructure()
+				{
+					std::vector < MergeStrategyMergeInternalBlock * > V_internal;
+					std::vector < MergeStrategyMergeInternalSmallBlock * > V_internal_small;
+					std::vector < MergeStrategyMergeExternalBlock * > V_external;
+
+					merge_tree->getMergeBlocks(V_internal,V_internal_small,V_external);
+
+					std::size_t const num_merge_nodes = V_internal.size() + V_internal_small.size() + V_external.size();
+
+					std::unique_ptr<libmaus2::util::DirectoryStructure> tDS(
+						new libmaus2::util::DirectoryStructure(
+							ds_tmp_path_merge_ds_tmp.string(),
+							64 /* mod */,
+							num_merge_nodes,
+							merge_tmp_path.string()
+						)
+					);
+
+					DSmerge = std::move(tDS);
+					DSmerge->setAsciiFlag(true);
+
+					std::size_t z = 0;
+					for ( auto * p : V_internal )
+						M_merge_dirs[p] = (*DSmerge)[z++];
+					for ( auto * p : V_internal_small )
+						M_merge_dirs[p] = (*DSmerge)[z++];
+					for ( auto * p : V_external )
+						M_merge_dirs[p] = (*DSmerge)[z++];
+				}
+
+				void setMergeTree(std::shared_ptr<libmaus2::suffixsort::bwtb3m::MergeTree> r_merge_tree)
+				{
+					merge_tree = r_merge_tree;
+
+					std::vector < MergeStrategyMergeInternalBlock * > V_internal;
+					std::vector < MergeStrategyMergeInternalSmallBlock * > V_internal_small;
+					std::vector < MergeStrategyMergeExternalBlock * > V_external;
+
+					merge_tree->getMergeBlocks(V_internal,V_internal_small,V_external);
+
+					setupMergeDirectoryStructure();
+					DSmerge->doGenerate();
+					for ( auto & P : M_merge_dirs ) {
+						libmaus2::aio::OutputStreamFactoryContainer::mkdir(P.second,0700);
+						P.first->sortresult.setTempPrefixSingle(P.second);
+					}
+				}
 			};
 		}
 	}
@@ -824,6 +955,13 @@ namespace libmaus2
 					int const verbose
 				)
 				{
+					std::vector < std::string > gtpartnames(zblocks.size());
+					for ( std::size_t z = 0; z < gtpartnames.size(); ++z ) {
+						std::string const gtpartname = gtmpgen.getFileName() + "_" + ::libmaus2::util::NumberSerialisation::formatNumber(z,4) + ".gt";
+						::libmaus2::util::TempFileRemovalContainer::addTempFile(gtpartname);
+						gtpartnames[z] = gtpartname;
+					}
+
 					// gap array
 					uint64_t const Gsize = cblocksize+1;
 
@@ -903,7 +1041,6 @@ namespace libmaus2
 						zabsblockpos[z] = zblocks[z].getZAbsPos();
 					zabsblockpos [ zactive ] = blockstart + cblocksize;
 
-					std::vector < std::string > gtpartnames(zactive);
 
 					if ( verbose >= 5 && logstr )
 					{
@@ -921,9 +1058,8 @@ namespace libmaus2
 
 						::libmaus2::suffixsort::BwtMergeZBlock const & zblock = zblocks[z];
 
-						std::string const gtpartname = gtmpgen.getFileName() + "_" + ::libmaus2::util::NumberSerialisation::formatNumber(z,4) + ".gt";
-						::libmaus2::util::TempFileRemovalContainer::addTempFile(gtpartname);
-						gtpartnames[z] = gtpartname;
+						std::string const gtpartname = gtpartnames.at(z);
+
 						#if 0
 						::libmaus2::huffman::HuffmanEncoderFileStd GTHEF(gtpartname);
 						#endif
@@ -962,7 +1098,7 @@ namespace libmaus2
 					libmaus2::parallel::StdSpinLock gslock;
 
 					#if defined(_OPENMP)
-					#pragma omp parallel for
+					#pragma omp parallel for num_threads(numthreads)
 					#endif
 					for ( uint64_t t = 0; t < cblocks; ++t )
 					{
@@ -1000,7 +1136,6 @@ namespace libmaus2
 					uint64_t const fs, // length of text file in symbols
 					libmaus2::suffixsort::bwtb3m::MergeStrategyMergeGapRequest const & msmgr, // merge request
 					std::vector<std::string> const & mergedgtname, // previous gt file name
-					//std::string const & newmergedgtname, // new gt file name
 					::libmaus2::lf::DArray * const accD, // accumulated symbol freqs for block
 					uint64_t const numthreads,
 					std::ostream * logstr,
@@ -1749,7 +1884,6 @@ namespace libmaus2
 					int const verbose
 				)
 				{
-
 					if ( logstr )
 						(*logstr) << "[V] Merging BWT blocks MergeStrategyMergeInternalBlock." << std::endl;
 
@@ -1785,7 +1919,7 @@ namespace libmaus2
 					// gt bit array,
 					// huffman shaped wavelet tree and
 					// histogram
-					result.setTempPrefixAndRegisterAsTemp(gtmpgen,0 /* no preset bwt file names */, 0 /* no preset gt file names */, 0 /* no preset isa */);
+					// result.setTempPrefixSingleAndRegisterAsTemp(gtmpgen);
 
 					if ( verbose >= 5 && logstr )
 					{
@@ -1855,6 +1989,7 @@ namespace libmaus2
 							(*logstr) << "[V] renaming gt files" << std::endl;
 						}
 
+						// move gt files for left block to output by renaming them
 						std::vector<std::string> oldgtnames;
 						for ( uint64_t i = 0; i < blockresults.getFiles().getGT().size(); ++i )
 						{
@@ -1866,6 +2001,7 @@ namespace libmaus2
 							libmaus2::aio::OutputStreamFactoryContainer::rename(blockresults.getFiles().getGT()[i].c_str(), renamed.c_str());
 						}
 
+						// concatenate new gt files and old ones to obtain output gt files
 						result.setGT(stringVectorAppend(GACR.gtpartnames,oldgtnames));
 
 						::libmaus2::timing::RealTimeClock rtc; rtc.start();
@@ -1931,184 +2067,6 @@ namespace libmaus2
 
 						rtc.start();
 
-						#if 0
-						if ( verbose >= 5 && logstr )
-						{
-							(*logstr) << "[V] computing work packets" << std::endl;
-						}
-
-						uint64_t const logG = std::max(libmaus2::math::ilog(Gsize),static_cast<unsigned int>(1));
-						uint64_t const logG2 = logG*logG;
-						// target number of G samples
-						uint64_t const tnumGsamp = std::max(Gsize / logG2,static_cast<uint64_t>(256*numthreads));
-						uint64_t const Gsampleblocksize = (Gsize + tnumGsamp - 1) / tnumGsamp;
-						// number of G samples
-						uint64_t const numGsamp = (Gsize + Gsampleblocksize - 1) / Gsampleblocksize;
-
-						libmaus2::autoarray::AutoArray < uint64_t > Gsamples(numGsamp,false);
-
-						#if defined(_OPENMP)
-						#pragma omp parallel for num_threads(numthreads)
-						#endif
-						for ( uint64_t t = 0; t < numGsamp; ++t )
-						{
-							uint64_t const low = t * Gsampleblocksize;
-							uint64_t const high = std::min(low + Gsampleblocksize, Gsize);
-							assert ( high >= low );
-							uint64_t s = 0;
-							G_array_iterator gp = GACR.G.begin() + low;
-							for ( uint64_t i = low; i < high; ++i )
-								s += *(gp++);
-							s += (high-low);
-							if ( high == Gsize && high != low )
-								s -= 1;
-							Gsamples[t] = s;
-						}
-
-						#if 0
-						std::vector<uint64_t> G_A(Gsamples.begin(),Gsamples.end());
-						std::vector<uint64_t> G_B(Gsamples.begin(),Gsamples.end());
-
-						libmaus2::util::PrefixSums::prefixSums(G_A.begin(),G_A.end());
-						libmaus2::util::PrefixSums::parallelPrefixSums(G_A.begin(),G_A.end(),numthreads);
-						#endif
-
-						uint64_t const Gsum = libmaus2::util::PrefixSums::parallelPrefixSums(Gsamples.begin(),Gsamples.end(),numthreads);
-
-						if ( verbose >= 5 && logstr )
-						{
-							(*logstr) << "[V] G size " << Gsize << " number of G samples " << numGsamp << std::endl;
-						}
-
-						uint64_t const Gsumperthread = (Gsum + numthreads-1)/numthreads;
-						std::vector < std::pair<uint64_t,uint64_t> > wpacks;
-						wpacks = std::vector < std::pair<uint64_t,uint64_t> >(numthreads);
-						#if defined(_OPENMP)
-						#pragma omp parallel for num_threads(numthreads)
-						#endif
-						for ( uint64_t i = 0; i < numthreads; ++i )
-						{
-							uint64_t const target = i * Gsumperthread;
-							uint64_t const * p = ::std::lower_bound(Gsamples.begin(),Gsamples.end(),target);
-
-							if ( p == Gsamples.end() )
-								--p;
-							while ( *p > target )
-								--p;
-
-							assert ( *p <= target );
-
-							uint64_t iv = (p - Gsamples.begin()) * Gsampleblocksize;
-							uint64_t s = *p;
-
-							G_array_iterator gp = GACR.G.begin() + iv;
-							while ( s < target && iv < Gsize )
-							{
-								s += (*(gp++))+1;
-								iv++;
-							}
-							if ( iv == Gsize )
-								s -= 1;
-
-							wpacks[i].first = iv;
-							if ( i )
-								wpacks[i-1].second = iv;
-							// std::cerr << "i=" << i << " iv=" << iv << " Gsize=" << Gsize << std::endl;
-						}
-						wpacks.back().second = Gsize;
-
-						// remove empty packages
-						{
-							uint64_t o = 0;
-							for ( uint64_t i = 0; i < wpacks.size(); ++i )
-								if ( wpacks[i].first != wpacks[i].second )
-									wpacks[o++] = wpacks[i];
-							wpacks.resize(o);
-						}
-
-						std::vector < uint64_t > P;
-						P.resize(wpacks.size()+1);
-						#if defined(_OPENMP)
-						#pragma omp parallel for num_threads(numthreads)
-						#endif
-						for ( uint64_t i = 0; i < wpacks.size(); ++i )
-						{
-							uint64_t const low = wpacks[i].first;
-							uint64_t const high = wpacks[i].second;
-
-							G_array_iterator gp = GACR.G.begin() + low;
-							uint64_t s = 0;
-							for ( uint64_t i = low; i < high; ++i )
-								s += *(gp++);
-
-							P[i] = s;
-
-						}
-						libmaus2::util::PrefixSums::prefixSums(P.begin(),P.end());
-
-						#if 0
-						// std::cerr << "(computing work packets...";
-						P.push_back(0);
-						uint64_t ilow = 0;
-						//
-						uint64_t const totalsuf = result.getCBlockSize();
-						// number of packets
-						uint64_t const numpack = numthreads;
-						// suffixes per thread
-						uint64_t const tpacksize = (totalsuf + numpack-1)/numpack;
-						while ( ilow != Gsize )
-						{
-							uint64_t s = 0;
-							uint64_t ihigh = ilow;
-
-							if ( verbose >= 5 && logstr )
-							{
-								(*logstr) << "[V] ilow=" << ilow << std::endl;
-							}
-
-							while ( ihigh != Gsize && s < tpacksize )
-								s += (GACR.G[ihigh++]+1);
-
-							uint64_t const p = s-(ihigh-ilow);
-
-							if ( ihigh+1 == Gsize && GACR.G[ihigh] == 0 )
-								ihigh++;
-
-
-							if ( verbose >= 5 && logstr )
-							{
-								(*logstr) << "[V] ihigh=" << ilow << std::endl;
-							}
-
-							// std::cerr << "[" << ilow << "," << ihigh << ")" << std::endl;
-
-							assert ( p == std::accumulate(GACR.G.begin()+ilow,GACR.G.begin()+ihigh,0ull) );
-
-							if ( verbose >= 5 && logstr )
-							{
-								(*logstr) << "[V] accumulate check done" << std::endl;
-							}
-
-							P.push_back(P.back() + p);
-							wpacks.push_back(std::pair<uint64_t,uint64_t>(ilow,ihigh));
-							encfilenames.push_back(
-								gtmpgen.getFileName()
-								// result.getFiles().getBWT()
-								+ "_"
-								+ ::libmaus2::util::NumberSerialisation::formatNumber(encfilenames.size(),6)
-								+ ".bwt"
-							);
-							::libmaus2::util::TempFileRemovalContainer::addTempFile(encfilenames.back());
-							ilow = ihigh;
-
-							if ( verbose >= 5 && logstr )
-							{
-								(*logstr) << "[V] end of single loop" << std::endl;
-							}
-						}
-						#endif
-						#endif
-
 						std::vector < std::string > encfilenames(wpacks.size());
 						for ( uint64_t i = 0; i < wpacks.size(); ++i )
 						{
@@ -2317,7 +2275,7 @@ namespace libmaus2
 								gapfilenames.push_back(newgapname);
 							}
 
-							// bwt name
+							// create new names for the input bwt names
 							std::vector<std::string> newbwtnames;
 							for ( uint64_t i = 0; i < mergereq.children[bb]->sortresult.getFiles().getBWT().size(); ++i )
 							{
@@ -2377,7 +2335,6 @@ namespace libmaus2
 							GapArrayComputationResult const GACR = computeGapArray(
 								gtmpgen,fn,fs,*(mergereq.gaprequests[bx]),
 								mergedgtname,
-								//newmergedgtname,
 								accD.get(),
 								numthreads,
 								logstr,
@@ -2574,7 +2531,7 @@ namespace libmaus2
 					// gt bit array,
 					// huffman shaped wavelet tree and
 					// histogram
-					result.setTempPrefixAndRegisterAsTemp(gtmpgen,0 /* no preset bwt file names */, 0 /* no preset gt file names */, 0 /* no preset isa */);
+					// result.setTempPrefixSingleAndRegisterAsTemp(gtmpgen);
 
 					// if we merge only two blocks together, then we do not need to write the gap array to disk
 					if ( mergereq.children.size() == 2 )
@@ -3161,13 +3118,10 @@ namespace libmaus2
 					result.setCBlockSize ( 0 );
 					for ( uint64_t i = 0; i < mergereq.children.size(); ++i )
 						result.setCBlockSize( result.getCBlockSize() + mergereq.children[i]->sortresult.getCBlockSize() );
-					// set up
-					// filenames of output bwt,
-					// sampled inverse suffix array filename,
-					// gt bit array,
+					// set up file names for
 					// huffman shaped wavelet tree and
 					// histogram
-					result.setTempPrefixAndRegisterAsTemp(gtmpgen,0,0,0);
+					// result.setTempPrefixSingleAndRegisterAsTemp(gtmpgen);
 
 					{
 						std::vector < std::vector < std::string > > gapfilenames;
@@ -3986,6 +3940,8 @@ namespace libmaus2
 						constructMergeTree(logstr,options,state,fn,rlencoderblocksize)
 					);
 
+					state.setMergeTree(mergetree);
+
 					// inner node queue
 					std::deque<uint64_t> itodo;
 
@@ -4033,6 +3989,8 @@ namespace libmaus2
 
 					assert ( mergetree->checkSerialisation() );
 
+					state.fillGTExpected();
+
 					if ( options.verbose >= 5 && logstr )
 						*logstr << "[V] checked serialisation after filling gap request objects" << std::endl;
 
@@ -4072,6 +4030,10 @@ namespace libmaus2
 							(*logstr) << std::endl;
 						}
 
+						std::string const l_merge_tmp_dir = state.getDirectoryForMerge(p);
+
+						// p->sortresult.setTempPrefixSingle(l_merge_tmp_dir);
+
 						#if 0
 						std::ostringstream tmpstr;
 						tmpstr << options.tmpfilenamebase << "_" << std::setfill('0') << std::setw(6) << (mtmpid++);
@@ -4080,6 +4042,8 @@ namespace libmaus2
 						std::ostringstream sparsetmpstr;
 						sparsetmpstr << options.sparsetmpfilenamebase << "_" << std::setfill('0') << std::setw(6) << (mtmpid++);
 
+						// std::size_t const gt_expected = state.getMergeGTExpected(p);
+
 						if ( dynamic_cast<libmaus2::suffixsort::bwtb3m::MergeStrategyMergeInternalBlock *>(p) )
 						{
 							mergeBlocks(
@@ -4140,6 +4104,10 @@ namespace libmaus2
 						if ( logstr )
 							(*logstr) << "[M"<< (mcnt++) << "] " << libmaus2::util::MemUsage() << " " << libmaus2::autoarray::AutoArrayMemUsage() << std::endl;
 						#endif
+
+						// std::size_t const gt_produced = p->sortresult.getFiles().getGT().size();
+
+						// std::cerr << "gt_expected=" << gt_expected << " gt_produced=" << gt_produced << std::endl;
 					}
 
 					uint64_t const memperthread = (options.mem + options.numthreads-1)/options.numthreads;


=====================================
src/libmaus2/suffixsort/bwtb3m/MergeStrategyBaseBlock.hpp
=====================================
@@ -33,6 +33,11 @@ namespace libmaus2
 				libmaus2::suffixsort::BwtMergeBlockSortRequest sortreq;
 				std::vector<uint64_t> querypos;
 
+				std::size_t getNumGtTempFilesRequired() const
+				{
+					return sortresult.getFiles().getGT().size();
+				}
+
 				bool equal(MergeStrategyBlock const & O) const
 				{
 					if ( dynamic_cast<MergeStrategyBaseBlock const *>(&O) == 0 )


=====================================
src/libmaus2/suffixsort/bwtb3m/MergeStrategyBlock.hpp
=====================================
@@ -238,6 +238,7 @@ namespace libmaus2
 				virtual void fillNodeMap(std::map<uint64_t, MergeStrategyBlock *> & M) = 0;
 				virtual void setParentId(int64_t const parentid) = 0;
 				virtual void collectBaseBlockIds(std::vector<uint64_t> & V) = 0;
+				virtual std::size_t getNumGtTempFilesRequired() const = 0;
 			};
 
 			inline std::ostream & operator<<(std::ostream & out, MergeStrategyBlock const & MSB)


=====================================
src/libmaus2/suffixsort/bwtb3m/MergeStrategyMergeBlock.hpp
=====================================
@@ -41,6 +41,15 @@ namespace libmaus2
 				/* number of children unfinished */
 				uint64_t unfinishedChildren;
 
+				std::size_t getNumGtTempFilesRequired() const {
+					std::size_t n = 0;
+					assert ( children.size() );
+					n += children.front()->getNumGtTempFilesRequired();
+					for ( auto p : gaprequests )
+						n += p->getNumGtTempFilesRequired();
+					return n;
+				}
+
 				void fillNodeMap(std::map<uint64_t, MergeStrategyBlock *> & M)
 				{
 					M[nodeid] = this;


=====================================
src/libmaus2/suffixsort/bwtb3m/MergeStrategyMergeGapRequest.hpp
=====================================
@@ -38,6 +38,11 @@ namespace libmaus2
 				uint64_t into;
 				std::vector < ::libmaus2::suffixsort::BwtMergeZBlock > zblocks;
 
+				std::size_t getNumGtTempFilesRequired() const
+				{
+					return zblocks.size();
+				}
+
 				bool operator==(MergeStrategyMergeGapRequest const & O) const
 				{
 					if ( into != O.into )


=====================================
src/libmaus2/suffixsort/bwtb3m/MergeTree.hpp
=====================================
@@ -147,6 +147,24 @@ namespace libmaus2
 
 					return true;
 				}
+
+				void getMergeBlocks(
+					std::vector < MergeStrategyMergeInternalBlock * > & V_internal,
+					std::vector < MergeStrategyMergeInternalSmallBlock * > & V_internal_small,
+					std::vector < MergeStrategyMergeExternalBlock * > & V_external
+				)
+				{
+					for ( auto P : node_map ) {
+						auto p = P.second;
+
+						if ( dynamic_cast<MergeStrategyMergeInternalBlock *>(p) )
+							V_internal.push_back(dynamic_cast<MergeStrategyMergeInternalBlock *>(p));
+						else if ( dynamic_cast<MergeStrategyMergeInternalSmallBlock *>(p) )
+							V_internal_small.push_back(dynamic_cast<MergeStrategyMergeInternalSmallBlock *>(p));
+						else if ( dynamic_cast<MergeStrategyMergeExternalBlock *>(p) )
+							V_external.push_back(dynamic_cast<MergeStrategyMergeExternalBlock *>(p));
+					}
+				}
 			};
 
 			inline std::ostream & operator<<(std::ostream & out, MergeTree const & MT)



View it on GitLab: https://salsa.debian.org/med-team/libmaus2/-/commit/f487c23719ada369798f125fb6b89e15a8dfb4d0

-- 
View it on GitLab: https://salsa.debian.org/med-team/libmaus2/-/commit/f487c23719ada369798f125fb6b89e15a8dfb4d0
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220224/3e98ef3a/attachment-0001.htm>


More information about the debian-med-commit mailing list