[med-svn] [Git][med-team/libmaus2][upstream] New upstream version 2.0.810+ds
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Thu Feb 24 20:12:12 GMT 2022
Étienne Mollier pushed to branch upstream at Debian Med / libmaus2
Commits:
f487c237 by Étienne Mollier at 2022-02-24T20:33:22+01:00
New upstream version 2.0.810+ds
- - - - -
10 changed files:
- ChangeLog
- configure.ac
- src/libmaus2/suffixsort/BwtMergeBlockSortResult.hpp
- src/libmaus2/suffixsort/BwtMergeTempFileNameSet.hpp
- src/libmaus2/suffixsort/bwtb3m/BwtMergeSortTemplate.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeStrategyBaseBlock.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeStrategyBlock.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeStrategyMergeBlock.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeStrategyMergeGapRequest.hpp
- src/libmaus2/suffixsort/bwtb3m/MergeTree.hpp
Changes:
=====================================
ChangeLog
=====================================
@@ -1,3 +1,9 @@
+libmaus2 (2.0.810-1) unstable; urgency=medium
+
+ * Apply some improvements in BWT construction
+
+ -- German Tischler-Höhle <germant at miltenyibiotec.de> Wed, 16 Feb 2022 14:51:33 +0100
+
libmaus2 (2.0.809-1) unstable; urgency=medium
* Improve cleanup in BwtMergeSortTemplate
=====================================
configure.ac
=====================================
@@ -1,5 +1,5 @@
-AC_INIT(libmaus2,2.0.809,[germant at miltenyibiotec.de],[libmaus2],[https://gitlab.com/german.tischler/libmaus2])
-LIBRARY_VERSION=2:809:0
+AC_INIT(libmaus2,2.0.810,[germant at miltenyibiotec.de],[libmaus2],[https://gitlab.com/german.tischler/libmaus2])
+LIBRARY_VERSION=2:810:0
AC_MSG_NOTICE([Configuring for source in directory ${srcdir}])
AC_CANONICAL_SYSTEM
AC_CANONICAL_HOST
=====================================
src/libmaus2/suffixsort/BwtMergeBlockSortResult.hpp
=====================================
@@ -99,6 +99,10 @@ namespace libmaus2
void removeFilesButBwtAndGt() const { files.removeFilesButBwtAndGt(); }
void setTempPrefixAndRegisterAsTemp(libmaus2::util::TempFileNameGenerator & gtmpgen, uint64_t const numbwt, uint64_t const numgt, uint64_t const numisa)
{ files.setPrefixAndRegisterAsTemp(gtmpgen,numbwt,numgt,numisa); }
+ // forward to files.setPrefixSingleAndRegisterAsTemp using the given temp file name generator
+ void setTempPrefixSingleAndRegisterAsTemp(libmaus2::util::TempFileNameGenerator & gtmpgen)
+ {
+     files.setPrefixSingleAndRegisterAsTemp(gtmpgen);
+ }
+ // forward to files.setTempPrefixSingle with the given directory name
+ void setTempPrefixSingle(std::string const & dir_name)
+ {
+     files.setTempPrefixSingle(dir_name);
+ }
libmaus2::autoarray::AutoArray < ::libmaus2::suffixsort::BwtMergeZBlock > const & getZBlocks() const { return zblocks; }
void resizeZBlocks(uint64_t const n) { zblocks.resize(n); }
void setZBlock(uint64_t const i, ::libmaus2::suffixsort::BwtMergeZBlock const & z) { zblocks.at(i) = z; }
=====================================
src/libmaus2/suffixsort/BwtMergeTempFileNameSet.hpp
=====================================
@@ -165,10 +165,58 @@ namespace libmaus2
void setSampledISA(std::string const & rsampledisa) { sampledisa = std::vector<std::string>(1,rsampledisa); }
void setSampledISA(std::vector<std::string> const & rsampledisa) { sampledisa = rsampledisa; }
+ /**
+  * set the hwtreq, hwt, hist and histfreq file names to the fixed names
+  * block.hwtreq, block.hwt, block.hist and block.histfreq inside dir_name
+  *
+  * @param dir_name directory the four file names are placed in
+  **/
+ void setTempPrefixSingle(std::string const & dir_name) {
+     std::filesystem::path const base(dir_name);
+
+     // same setter order as before: hwtreq, hwt, hist, histfreq
+     setHWTReq((base / "block.hwtreq").string());
+     setHWT((base / "block.hwt").string());
+     setHist((base / "block.hist").string());
+     setHistFreq((base / "block.histfreq").string());
+ }
+
+ /**
+  * set the hwtreq, hwt, hist and histfreq file names from gtmpgen
+  *
+  * number of files: 4 (one gtmpgen.getFileName() call per file, requested
+  * in the order hwtreq, hwt, hist, histfreq)
+  *
+  * @param gtmpgen generator the base file names are obtained from
+  **/
+ void setPrefixSingle(libmaus2::util::TempFileNameGenerator & gtmpgen)
+ {
+     setHWTReq(gtmpgen.getFileName() + ".hwtreq");
+     setHWT(gtmpgen.getFileName() + ".hwt");
+     setHist(gtmpgen.getFileName() + ".hist");
+     setHistFreq(gtmpgen.getFileName() + ".histfreq");
+ }
+
+ /**
+  * pass the hwt, hwtreq, hist and histfreq file names to
+  * TempFileRemovalContainer::addTempFile
+  **/
+ void registerAsTempSingle()
+ {
+     typedef ::libmaus2::util::TempFileRemovalContainer temp_registry_type;
+
+     temp_registry_type::addTempFile(getHWT());
+     temp_registry_type::addTempFile(getHWTReq());
+     temp_registry_type::addTempFile(getHist());
+     temp_registry_type::addTempFile(getHistFreq());
+ }
+
+ /**
+  * choose the four single file names via setPrefixSingle and then
+  * register them via registerAsTempSingle
+  *
+  * @param gtmpgen generator handed on to setPrefixSingle
+  **/
+ void setPrefixSingleAndRegisterAsTemp(libmaus2::util::TempFileNameGenerator & gtmpgen)
+ {
+     // names have to be set before they can be registered
+     setPrefixSingle(gtmpgen);
+     registerAsTempSingle();
+ }
+
void setPrefix(libmaus2::util::TempFileNameGenerator & gtmpgen, uint64_t const numbwt, uint64_t const numgt, uint64_t const numisa)
{
+ /**
+ * number of files: 4 + numgt + numbwt + numisa
+ **/
+ setPrefixSingle(gtmpgen);
+
std::vector<std::string> gtfilenames(numgt);
- for ( uint64_t i = 0; i < numbwt; ++i )
+ for ( uint64_t i = 0; i < numgt; ++i )
{
std::ostringstream ostr;
ostr << gtmpgen.getFileName() << '_'
@@ -187,16 +235,7 @@ namespace libmaus2
<< ".bwt";
bwtfilenames[i] = ostr.str();
}
-
setBWT(bwtfilenames);
- std::string const thwtreq = gtmpgen.getFileName()+".hwtreq";
- setHWTReq(thwtreq);
- std::string const thwt = gtmpgen.getFileName()+".hwt";
- setHWT(thwt);
- std::string const thist = gtmpgen.getFileName()+".hist";
- setHist(thist);
- std::string const thistfreq = gtmpgen.getFileName()+".histfreq";
- setHist(thistfreq);
std::vector<std::string> isafilenames(numisa);
for ( uint64_t i = 0; i < numisa; ++i )
@@ -210,15 +249,18 @@ namespace libmaus2
setSampledISA(isafilenames);
}
+
void setPrefixAndRegisterAsTemp(libmaus2::util::TempFileNameGenerator & gtmpgen, uint64_t const numbwt, uint64_t const numgt, uint64_t const numisa)
{
+
setPrefix(gtmpgen, numbwt, numgt, numisa);
+
+ registerAsTempSingle();
+
for ( uint64_t i = 0; i < getGT().size(); ++i )
::libmaus2::util::TempFileRemovalContainer::addTempFile(getGT()[i]);
for ( uint64_t i = 0; i < getBWT().size(); ++i )
::libmaus2::util::TempFileRemovalContainer::addTempFile(getBWT()[i]);
- ::libmaus2::util::TempFileRemovalContainer::addTempFile(getHWT());
- ::libmaus2::util::TempFileRemovalContainer::addTempFile(getHist());
for ( uint64_t i = 0; i < getSampledISAVector().size(); ++i )
::libmaus2::util::TempFileRemovalContainer::addTempFile(getSampledISAVector()[i]);
}
@@ -311,7 +353,7 @@ namespace libmaus2
uint64_t const numgtfiles
)
{
- return numgtfiles + numbwtfiles + 1 + 1 + 1 + 1;
+ return numgtfiles + numbwtfiles + 1 /* hwtreq */ + 1 /* hwt */ + 1 /* hist */ + 1 /* hist.freq */ + 1 /* sample isa */;
}
=====================================
src/libmaus2/suffixsort/bwtb3m/BwtMergeSortTemplate.hpp
=====================================
@@ -1,6 +1,6 @@
/**
libmaus2
- Copyright (C) 2009-2021 German Tischler-Höhle
+ Copyright (C) 2009-2022 German Tischler-Höhle
Copyright (C) 2011-2014 Genome Research Limited
This program is free software: you can redistribute it and/or modify
@@ -108,13 +108,19 @@ namespace libmaus2
// array of computed LCP values between block and start of next block
std::shared_ptr< libmaus2::util::AtomicArray<uint64_t> > V_boundedlcpblockvalues;
+ // base tmp directory name
std::string tmpdirname;
+ // tmpdirname as path object
std::filesystem::path tmppath;
- std::filesystem::path ds_tmp_path;
std::filesystem::path base_tmp_path;
std::filesystem::path ds_tmp_path_base_ds_tmp;
+ std::filesystem::path merge_tmp_path;
+ std::filesystem::path ds_tmp_path_merge_ds_tmp;
std::unique_ptr<libmaus2::util::DirectoryStructure> DSbase;
+ std::unique_ptr<libmaus2::util::DirectoryStructure> DSmerge;
+ std::map < MergeStrategyBlock *, std::string > M_merge_dirs;
+ std::map < MergeStrategyBlock *, std::size_t> M_merge_gt_expected;
std::unique_ptr< ::libmaus2::suffixsort::BwtMergeTempFileNameSetVector > blocktmpnames;
@@ -125,6 +131,8 @@ namespace libmaus2
libmaus2::huffman::HuffmanTree::unique_ptr_type uhnode;
+ std::shared_ptr<libmaus2::suffixsort::bwtb3m::MergeTree> merge_tree;
+
static void serialiseNumber(std::ostream & ostr, uint64_t const n) {
libmaus2::util::NumberSerialisation::serialiseNumber(ostr,n);
}
@@ -154,14 +162,23 @@ namespace libmaus2
libmaus2::util::StringSerialisation::serialiseString(ostr,tmpdirname);
libmaus2::util::StringSerialisation::serialiseString(ostr,tmppath.string());
- libmaus2::util::StringSerialisation::serialiseString(ostr,ds_tmp_path.string());
+
libmaus2::util::StringSerialisation::serialiseString(ostr,base_tmp_path.string());
libmaus2::util::StringSerialisation::serialiseString(ostr,ds_tmp_path_base_ds_tmp.string());
+ libmaus2::util::StringSerialisation::serialiseString(ostr,merge_tmp_path.string());
+ libmaus2::util::StringSerialisation::serialiseString(ostr,ds_tmp_path_merge_ds_tmp.string());
+
libmaus2::util::NumberMapSerialisation::serialiseMap<std::ostream,int64_t,uint64_t>(ostr,chistnoterm);
libmaus2::util::NumberMapSerialisation::serialiseMap<std::ostream,int64_t,uint64_t>(ostr,chist);
libmaus2::util::NumberSerialisation::serialiseSignedNumber(ostr,bwtterm);
libmaus2::util::NumberSerialisation::serialiseNumber(ostr,maxsym);
+
+ if ( merge_tree )
+ {
+ libmaus2::util::NumberSerialisation::serialiseNumber(ostr,1);
+ merge_tree->serialise(ostr);
+ }
}
void deserialise(std::istream & istr)
@@ -191,10 +208,13 @@ namespace libmaus2
tmpdirname = libmaus2::util::StringSerialisation::deserialiseString(istr);
tmppath = libmaus2::util::StringSerialisation::deserialiseString(istr);
- ds_tmp_path = libmaus2::util::StringSerialisation::deserialiseString(istr);
+
base_tmp_path = libmaus2::util::StringSerialisation::deserialiseString(istr);
ds_tmp_path_base_ds_tmp = libmaus2::util::StringSerialisation::deserialiseString(istr);
+ merge_tmp_path = libmaus2::util::StringSerialisation::deserialiseString(istr);
+ ds_tmp_path_merge_ds_tmp = libmaus2::util::StringSerialisation::deserialiseString(istr);
+
std::unique_ptr<libmaus2::util::DirectoryStructure> t_DSbase(
BwtMergeTempFileNameSetVector::getDirectoryStructure(
ds_tmp_path_base_ds_tmp.string(),
@@ -221,6 +241,18 @@ namespace libmaus2
if ( chist.size() )
computeHuffmanTree();
+
+ bool const have_merge_tree = libmaus2::util::NumberSerialisation::deserialiseNumber(istr) != 0;
+
+ if ( have_merge_tree )
+ {
+ std::shared_ptr<libmaus2::suffixsort::bwtb3m::MergeTree> t_merge_tree(
+ new libmaus2::suffixsort::bwtb3m::MergeTree(istr)
+ );
+ merge_tree = t_merge_tree;
+
+ setupMergeDirectoryStructure();
+ }
}
static uint64_t getFileSize(
@@ -325,11 +357,14 @@ namespace libmaus2
blocksizeprevtwo((blocksize == blocksizenexttwo) ? blocksize : (blocksizenexttwo / 2)),
preisasamplingrate(std::min(options.maxpreisasamplingrate,blocksizeprevtwo)),
V_boundedlcpblockvalues(new libmaus2::util::AtomicArray<uint64_t>(numblocks,0)),
+ // tmp directory name
tmpdirname(ensureDirectory(options.tmpfilenamebase + "_tmpdir")),
+ // path object for tmp directory name
tmppath(tmpdirname),
- ds_tmp_path(ensureDirectory(tmppath / "ds_tmp")),
base_tmp_path(ensureDirectory(tmppath / "base_tmp")),
ds_tmp_path_base_ds_tmp(tmppath / "base_ds.tmp"),
+ merge_tmp_path(ensureDirectory(tmppath / "merge_tmp")),
+ ds_tmp_path_merge_ds_tmp(tmppath / "merge_ds.tmp"),
DSbase(
BwtMergeTempFileNameSetVector::getDirectoryStructure(
ds_tmp_path_base_ds_tmp.string(),
@@ -340,6 +375,7 @@ namespace libmaus2
options.numthreads /* gt */
)
),
+ DSmerge(),
blocktmpnames(
new ::libmaus2::suffixsort::BwtMergeTempFileNameSetVector(*DSbase, getNumBlocks(), options.numthreads /* bwt */, options.numthreads /* gt */)
),
@@ -356,12 +392,6 @@ namespace libmaus2
// there should be at least one block as input size is not zero
assert ( numblocks );
- #if 0
- libmaus2::aio::OutputStreamFactoryContainer::mkdirp(tmpdirname,0700);
- libmaus2::aio::OutputStreamFactoryContainer::mkdirp(ds_tmp_path.string(),0700);
- libmaus2::aio::OutputStreamFactoryContainer::mkdirp(base_tmp_path.string(),0700);
- #endif
-
DSbase->doGenerate();
// std::cerr << "blocktmpnames=\n" << blocktmpnames->toString();
@@ -376,9 +406,17 @@ namespace libmaus2
void cleanup()
{
+ if ( DSmerge ) {
+ for ( auto const & P : M_merge_dirs )
+ libmaus2::aio::OutputStreamFactoryContainer::rmdir(P.second);
+ DSmerge->doRemove();
+ }
+ else
+ libmaus2::aio::OutputStreamFactoryContainer::rmdir(merge_tmp_path);
+
DSbase->doRemove();
libmaus2::aio::FileRemoval::removeFile(ds_tmp_path_base_ds_tmp.string());
- libmaus2::aio::OutputStreamFactoryContainer::rmdir(ds_tmp_path.string());
+ libmaus2::aio::FileRemoval::removeFile(ds_tmp_path_merge_ds_tmp.string());
libmaus2::aio::OutputStreamFactoryContainer::rmdir(tmpdirname);
}
@@ -530,6 +568,99 @@ namespace libmaus2
huftreeCOS->flush();
huftreeCOS.reset();
}
+
+ /**
+  * look up the value stored for node p in M_merge_gt_expected
+  *
+  * @param p node to look up
+  * @return value stored for p
+  * @throws libmaus2::exception::LibMausException if there is no entry for p
+  **/
+ std::size_t getMergeGTExpected(MergeStrategyBlock * p) const
+ {
+     auto const entry = M_merge_gt_expected.find(p);
+
+     if ( entry != M_merge_gt_expected.end() )
+         return entry->second;
+
+     // no entry for this node
+     libmaus2::exception::LibMausException lme;
+     lme.getStream() << "BwtMergeState::getMergeGTExpected: unable to find node" << std::endl;
+     lme.finish();
+     throw lme;
+ }
+
+ /**
+  * look up the directory name stored for node p in M_merge_dirs
+  *
+  * @param p node to look up
+  * @return directory name stored for p
+  * @throws libmaus2::exception::LibMausException if there is no entry for p
+  **/
+ std::string getDirectoryForMerge(MergeStrategyBlock * p) const
+ {
+     auto const entry = M_merge_dirs.find(p);
+
+     if ( entry != M_merge_dirs.end() )
+         return entry->second;
+
+     // no entry for this node
+     libmaus2::exception::LibMausException lme;
+     lme.getStream() << "BwtMergeState::getDirectoryForMerge: unable to find node" << std::endl;
+     lme.finish();
+     throw lme;
+ }
+
+ /**
+  * store getNumGtTempFilesRequired() of every merge node returned by
+  * merge_tree->getMergeBlocks in M_merge_gt_expected
+  **/
+ void fillGTExpected()
+ {
+     std::vector < MergeStrategyMergeInternalBlock * > internal_nodes;
+     std::vector < MergeStrategyMergeInternalSmallBlock * > internal_small_nodes;
+     std::vector < MergeStrategyMergeExternalBlock * > external_nodes;
+
+     merge_tree->getMergeBlocks(internal_nodes,internal_small_nodes,external_nodes);
+
+     // generic so each node is queried through its own static type
+     auto const record = [this](auto * node) {
+         M_merge_gt_expected[node] = node->getNumGtTempFilesRequired();
+     };
+
+     for ( auto * node : internal_nodes )
+         record(node);
+     for ( auto * node : internal_small_nodes )
+         record(node);
+     for ( auto * node : external_nodes )
+         record(node);
+ }
+
+ /**
+  * create the DSmerge directory structure object with one entry per merge
+  * node of merge_tree and record the entry assigned to each node in
+  * M_merge_dirs
+  *
+  * entries are handed out in the order internal, internal small, external
+  **/
+ void setupMergeDirectoryStructure()
+ {
+     std::vector < MergeStrategyMergeInternalBlock * > internal_nodes;
+     std::vector < MergeStrategyMergeInternalSmallBlock * > internal_small_nodes;
+     std::vector < MergeStrategyMergeExternalBlock * > external_nodes;
+
+     merge_tree->getMergeBlocks(internal_nodes,internal_small_nodes,external_nodes);
+
+     std::size_t const num_merge_nodes =
+         internal_nodes.size() + internal_small_nodes.size() + external_nodes.size();
+
+     DSmerge.reset(
+         new libmaus2::util::DirectoryStructure(
+             ds_tmp_path_merge_ds_tmp.string(),
+             64 /* mod */,
+             num_merge_nodes,
+             merge_tmp_path.string()
+         )
+     );
+     DSmerge->setAsciiFlag(true);
+
+     // running index into DSmerge shared by all three node lists
+     std::size_t next_dir = 0;
+     auto const assign_dir = [this,&next_dir](MergeStrategyBlock * node) {
+         M_merge_dirs[node] = (*DSmerge)[next_dir++];
+     };
+
+     for ( auto * node : internal_nodes )
+         assign_dir(node);
+     for ( auto * node : internal_small_nodes )
+         assign_dir(node);
+     for ( auto * node : external_nodes )
+         assign_dir(node);
+ }
+
+ /**
+  * install the merge tree, set up the merge directory structure for it,
+  * create one directory per merge node and point the node's sort result
+  * at that directory
+  *
+  * @param r_merge_tree merge tree to be stored in this object
+  **/
+ void setMergeTree(std::shared_ptr<libmaus2::suffixsort::bwtb3m::MergeTree> r_merge_tree)
+ {
+     // parameter is taken by value, so hand it over instead of copying it again
+     merge_tree = std::move(r_merge_tree);
+
+     // fills DSmerge and M_merge_dirs (collects the merge nodes itself)
+     setupMergeDirectoryStructure();
+     DSmerge->doGenerate();
+
+     for ( auto const & [block, dir] : M_merge_dirs ) {
+         libmaus2::aio::OutputStreamFactoryContainer::mkdir(dir,0700);
+         block->sortresult.setTempPrefixSingle(dir);
+     }
+ }
};
}
}
@@ -824,6 +955,13 @@ namespace libmaus2
int const verbose
)
{
+ std::vector < std::string > gtpartnames(zblocks.size());
+ for ( std::size_t z = 0; z < gtpartnames.size(); ++z ) {
+ std::string const gtpartname = gtmpgen.getFileName() + "_" + ::libmaus2::util::NumberSerialisation::formatNumber(z,4) + ".gt";
+ ::libmaus2::util::TempFileRemovalContainer::addTempFile(gtpartname);
+ gtpartnames[z] = gtpartname;
+ }
+
// gap array
uint64_t const Gsize = cblocksize+1;
@@ -903,7 +1041,6 @@ namespace libmaus2
zabsblockpos[z] = zblocks[z].getZAbsPos();
zabsblockpos [ zactive ] = blockstart + cblocksize;
- std::vector < std::string > gtpartnames(zactive);
if ( verbose >= 5 && logstr )
{
@@ -921,9 +1058,8 @@ namespace libmaus2
::libmaus2::suffixsort::BwtMergeZBlock const & zblock = zblocks[z];
- std::string const gtpartname = gtmpgen.getFileName() + "_" + ::libmaus2::util::NumberSerialisation::formatNumber(z,4) + ".gt";
- ::libmaus2::util::TempFileRemovalContainer::addTempFile(gtpartname);
- gtpartnames[z] = gtpartname;
+ std::string const gtpartname = gtpartnames.at(z);
+
#if 0
::libmaus2::huffman::HuffmanEncoderFileStd GTHEF(gtpartname);
#endif
@@ -962,7 +1098,7 @@ namespace libmaus2
libmaus2::parallel::StdSpinLock gslock;
#if defined(_OPENMP)
- #pragma omp parallel for
+ #pragma omp parallel for num_threads(numthreads)
#endif
for ( uint64_t t = 0; t < cblocks; ++t )
{
@@ -1000,7 +1136,6 @@ namespace libmaus2
uint64_t const fs, // length of text file in symbols
libmaus2::suffixsort::bwtb3m::MergeStrategyMergeGapRequest const & msmgr, // merge request
std::vector<std::string> const & mergedgtname, // previous gt file name
- //std::string const & newmergedgtname, // new gt file name
::libmaus2::lf::DArray * const accD, // accumulated symbol freqs for block
uint64_t const numthreads,
std::ostream * logstr,
@@ -1749,7 +1884,6 @@ namespace libmaus2
int const verbose
)
{
-
if ( logstr )
(*logstr) << "[V] Merging BWT blocks MergeStrategyMergeInternalBlock." << std::endl;
@@ -1785,7 +1919,7 @@ namespace libmaus2
// gt bit array,
// huffman shaped wavelet tree and
// histogram
- result.setTempPrefixAndRegisterAsTemp(gtmpgen,0 /* no preset bwt file names */, 0 /* no preset gt file names */, 0 /* no preset isa */);
+ // result.setTempPrefixSingleAndRegisterAsTemp(gtmpgen);
if ( verbose >= 5 && logstr )
{
@@ -1855,6 +1989,7 @@ namespace libmaus2
(*logstr) << "[V] renaming gt files" << std::endl;
}
+ // move gt files for left block to output by renaming them
std::vector<std::string> oldgtnames;
for ( uint64_t i = 0; i < blockresults.getFiles().getGT().size(); ++i )
{
@@ -1866,6 +2001,7 @@ namespace libmaus2
libmaus2::aio::OutputStreamFactoryContainer::rename(blockresults.getFiles().getGT()[i].c_str(), renamed.c_str());
}
+ // concatenate new gt files and old ones to obtain output gt files
result.setGT(stringVectorAppend(GACR.gtpartnames,oldgtnames));
::libmaus2::timing::RealTimeClock rtc; rtc.start();
@@ -1931,184 +2067,6 @@ namespace libmaus2
rtc.start();
- #if 0
- if ( verbose >= 5 && logstr )
- {
- (*logstr) << "[V] computing work packets" << std::endl;
- }
-
- uint64_t const logG = std::max(libmaus2::math::ilog(Gsize),static_cast<unsigned int>(1));
- uint64_t const logG2 = logG*logG;
- // target number of G samples
- uint64_t const tnumGsamp = std::max(Gsize / logG2,static_cast<uint64_t>(256*numthreads));
- uint64_t const Gsampleblocksize = (Gsize + tnumGsamp - 1) / tnumGsamp;
- // number of G samples
- uint64_t const numGsamp = (Gsize + Gsampleblocksize - 1) / Gsampleblocksize;
-
- libmaus2::autoarray::AutoArray < uint64_t > Gsamples(numGsamp,false);
-
- #if defined(_OPENMP)
- #pragma omp parallel for num_threads(numthreads)
- #endif
- for ( uint64_t t = 0; t < numGsamp; ++t )
- {
- uint64_t const low = t * Gsampleblocksize;
- uint64_t const high = std::min(low + Gsampleblocksize, Gsize);
- assert ( high >= low );
- uint64_t s = 0;
- G_array_iterator gp = GACR.G.begin() + low;
- for ( uint64_t i = low; i < high; ++i )
- s += *(gp++);
- s += (high-low);
- if ( high == Gsize && high != low )
- s -= 1;
- Gsamples[t] = s;
- }
-
- #if 0
- std::vector<uint64_t> G_A(Gsamples.begin(),Gsamples.end());
- std::vector<uint64_t> G_B(Gsamples.begin(),Gsamples.end());
-
- libmaus2::util::PrefixSums::prefixSums(G_A.begin(),G_A.end());
- libmaus2::util::PrefixSums::parallelPrefixSums(G_A.begin(),G_A.end(),numthreads);
- #endif
-
- uint64_t const Gsum = libmaus2::util::PrefixSums::parallelPrefixSums(Gsamples.begin(),Gsamples.end(),numthreads);
-
- if ( verbose >= 5 && logstr )
- {
- (*logstr) << "[V] G size " << Gsize << " number of G samples " << numGsamp << std::endl;
- }
-
- uint64_t const Gsumperthread = (Gsum + numthreads-1)/numthreads;
- std::vector < std::pair<uint64_t,uint64_t> > wpacks;
- wpacks = std::vector < std::pair<uint64_t,uint64_t> >(numthreads);
- #if defined(_OPENMP)
- #pragma omp parallel for num_threads(numthreads)
- #endif
- for ( uint64_t i = 0; i < numthreads; ++i )
- {
- uint64_t const target = i * Gsumperthread;
- uint64_t const * p = ::std::lower_bound(Gsamples.begin(),Gsamples.end(),target);
-
- if ( p == Gsamples.end() )
- --p;
- while ( *p > target )
- --p;
-
- assert ( *p <= target );
-
- uint64_t iv = (p - Gsamples.begin()) * Gsampleblocksize;
- uint64_t s = *p;
-
- G_array_iterator gp = GACR.G.begin() + iv;
- while ( s < target && iv < Gsize )
- {
- s += (*(gp++))+1;
- iv++;
- }
- if ( iv == Gsize )
- s -= 1;
-
- wpacks[i].first = iv;
- if ( i )
- wpacks[i-1].second = iv;
- // std::cerr << "i=" << i << " iv=" << iv << " Gsize=" << Gsize << std::endl;
- }
- wpacks.back().second = Gsize;
-
- // remove empty packages
- {
- uint64_t o = 0;
- for ( uint64_t i = 0; i < wpacks.size(); ++i )
- if ( wpacks[i].first != wpacks[i].second )
- wpacks[o++] = wpacks[i];
- wpacks.resize(o);
- }
-
- std::vector < uint64_t > P;
- P.resize(wpacks.size()+1);
- #if defined(_OPENMP)
- #pragma omp parallel for num_threads(numthreads)
- #endif
- for ( uint64_t i = 0; i < wpacks.size(); ++i )
- {
- uint64_t const low = wpacks[i].first;
- uint64_t const high = wpacks[i].second;
-
- G_array_iterator gp = GACR.G.begin() + low;
- uint64_t s = 0;
- for ( uint64_t i = low; i < high; ++i )
- s += *(gp++);
-
- P[i] = s;
-
- }
- libmaus2::util::PrefixSums::prefixSums(P.begin(),P.end());
-
- #if 0
- // std::cerr << "(computing work packets...";
- P.push_back(0);
- uint64_t ilow = 0;
- //
- uint64_t const totalsuf = result.getCBlockSize();
- // number of packets
- uint64_t const numpack = numthreads;
- // suffixes per thread
- uint64_t const tpacksize = (totalsuf + numpack-1)/numpack;
- while ( ilow != Gsize )
- {
- uint64_t s = 0;
- uint64_t ihigh = ilow;
-
- if ( verbose >= 5 && logstr )
- {
- (*logstr) << "[V] ilow=" << ilow << std::endl;
- }
-
- while ( ihigh != Gsize && s < tpacksize )
- s += (GACR.G[ihigh++]+1);
-
- uint64_t const p = s-(ihigh-ilow);
-
- if ( ihigh+1 == Gsize && GACR.G[ihigh] == 0 )
- ihigh++;
-
-
- if ( verbose >= 5 && logstr )
- {
- (*logstr) << "[V] ihigh=" << ilow << std::endl;
- }
-
- // std::cerr << "[" << ilow << "," << ihigh << ")" << std::endl;
-
- assert ( p == std::accumulate(GACR.G.begin()+ilow,GACR.G.begin()+ihigh,0ull) );
-
- if ( verbose >= 5 && logstr )
- {
- (*logstr) << "[V] accumulate check done" << std::endl;
- }
-
- P.push_back(P.back() + p);
- wpacks.push_back(std::pair<uint64_t,uint64_t>(ilow,ihigh));
- encfilenames.push_back(
- gtmpgen.getFileName()
- // result.getFiles().getBWT()
- + "_"
- + ::libmaus2::util::NumberSerialisation::formatNumber(encfilenames.size(),6)
- + ".bwt"
- );
- ::libmaus2::util::TempFileRemovalContainer::addTempFile(encfilenames.back());
- ilow = ihigh;
-
- if ( verbose >= 5 && logstr )
- {
- (*logstr) << "[V] end of single loop" << std::endl;
- }
- }
- #endif
- #endif
-
std::vector < std::string > encfilenames(wpacks.size());
for ( uint64_t i = 0; i < wpacks.size(); ++i )
{
@@ -2317,7 +2275,7 @@ namespace libmaus2
gapfilenames.push_back(newgapname);
}
- // bwt name
+ // create new names for the input bwt names
std::vector<std::string> newbwtnames;
for ( uint64_t i = 0; i < mergereq.children[bb]->sortresult.getFiles().getBWT().size(); ++i )
{
@@ -2377,7 +2335,6 @@ namespace libmaus2
GapArrayComputationResult const GACR = computeGapArray(
gtmpgen,fn,fs,*(mergereq.gaprequests[bx]),
mergedgtname,
- //newmergedgtname,
accD.get(),
numthreads,
logstr,
@@ -2574,7 +2531,7 @@ namespace libmaus2
// gt bit array,
// huffman shaped wavelet tree and
// histogram
- result.setTempPrefixAndRegisterAsTemp(gtmpgen,0 /* no preset bwt file names */, 0 /* no preset gt file names */, 0 /* no preset isa */);
+ // result.setTempPrefixSingleAndRegisterAsTemp(gtmpgen);
// if we merge only two blocks together, then we do not need to write the gap array to disk
if ( mergereq.children.size() == 2 )
@@ -3161,13 +3118,10 @@ namespace libmaus2
result.setCBlockSize ( 0 );
for ( uint64_t i = 0; i < mergereq.children.size(); ++i )
result.setCBlockSize( result.getCBlockSize() + mergereq.children[i]->sortresult.getCBlockSize() );
- // set up
- // filenames of output bwt,
- // sampled inverse suffix array filename,
- // gt bit array,
+ // set up file names for
// huffman shaped wavelet tree and
// histogram
- result.setTempPrefixAndRegisterAsTemp(gtmpgen,0,0,0);
+ // result.setTempPrefixSingleAndRegisterAsTemp(gtmpgen);
{
std::vector < std::vector < std::string > > gapfilenames;
@@ -3986,6 +3940,8 @@ namespace libmaus2
constructMergeTree(logstr,options,state,fn,rlencoderblocksize)
);
+ state.setMergeTree(mergetree);
+
// inner node queue
std::deque<uint64_t> itodo;
@@ -4033,6 +3989,8 @@ namespace libmaus2
assert ( mergetree->checkSerialisation() );
+ state.fillGTExpected();
+
if ( options.verbose >= 5 && logstr )
*logstr << "[V] checked serialisation after filling gap request objects" << std::endl;
@@ -4072,6 +4030,10 @@ namespace libmaus2
(*logstr) << std::endl;
}
+ std::string const l_merge_tmp_dir = state.getDirectoryForMerge(p);
+
+ // p->sortresult.setTempPrefixSingle(l_merge_tmp_dir);
+
#if 0
std::ostringstream tmpstr;
tmpstr << options.tmpfilenamebase << "_" << std::setfill('0') << std::setw(6) << (mtmpid++);
@@ -4080,6 +4042,8 @@ namespace libmaus2
std::ostringstream sparsetmpstr;
sparsetmpstr << options.sparsetmpfilenamebase << "_" << std::setfill('0') << std::setw(6) << (mtmpid++);
+ // std::size_t const gt_expected = state.getMergeGTExpected(p);
+
if ( dynamic_cast<libmaus2::suffixsort::bwtb3m::MergeStrategyMergeInternalBlock *>(p) )
{
mergeBlocks(
@@ -4140,6 +4104,10 @@ namespace libmaus2
if ( logstr )
(*logstr) << "[M"<< (mcnt++) << "] " << libmaus2::util::MemUsage() << " " << libmaus2::autoarray::AutoArrayMemUsage() << std::endl;
#endif
+
+ // std::size_t const gt_produced = p->sortresult.getFiles().getGT().size();
+
+ // std::cerr << "gt_expected=" << gt_expected << " gt_produced=" << gt_produced << std::endl;
}
uint64_t const memperthread = (options.mem + options.numthreads-1)/options.numthreads;
=====================================
src/libmaus2/suffixsort/bwtb3m/MergeStrategyBaseBlock.hpp
=====================================
@@ -33,6 +33,11 @@ namespace libmaus2
libmaus2::suffixsort::BwtMergeBlockSortRequest sortreq;
std::vector<uint64_t> querypos;
+ // number of gt file names currently stored in this block's sort result
+ std::size_t getNumGtTempFilesRequired() const
+ {
+     std::size_t const num_gt = sortresult.getFiles().getGT().size();
+     return num_gt;
+ }
+
bool equal(MergeStrategyBlock const & O) const
{
if ( dynamic_cast<MergeStrategyBaseBlock const *>(&O) == 0 )
=====================================
src/libmaus2/suffixsort/bwtb3m/MergeStrategyBlock.hpp
=====================================
@@ -238,6 +238,7 @@ namespace libmaus2
virtual void fillNodeMap(std::map<uint64_t, MergeStrategyBlock *> & M) = 0;
virtual void setParentId(int64_t const parentid) = 0;
virtual void collectBaseBlockIds(std::vector<uint64_t> & V) = 0;
+ virtual std::size_t getNumGtTempFilesRequired() const = 0;
};
inline std::ostream & operator<<(std::ostream & out, MergeStrategyBlock const & MSB)
=====================================
src/libmaus2/suffixsort/bwtb3m/MergeStrategyMergeBlock.hpp
=====================================
@@ -41,6 +41,15 @@ namespace libmaus2
/* number of children unfinished */
uint64_t unfinishedChildren;
+ /**
+  * @return getNumGtTempFilesRequired() of the first child plus the sum of
+  *         getNumGtTempFilesRequired() over all gap requests
+  **/
+ std::size_t getNumGtTempFilesRequired() const {
+     // children.front() below needs at least one child
+     assert ( children.size() );
+
+     std::size_t total = children.front()->getNumGtTempFilesRequired();
+
+     for ( auto const & request : gaprequests )
+         total += request->getNumGtTempFilesRequired();
+
+     return total;
+ }
+
void fillNodeMap(std::map<uint64_t, MergeStrategyBlock *> & M)
{
M[nodeid] = this;
=====================================
src/libmaus2/suffixsort/bwtb3m/MergeStrategyMergeGapRequest.hpp
=====================================
@@ -38,6 +38,11 @@ namespace libmaus2
uint64_t into;
std::vector < ::libmaus2::suffixsort::BwtMergeZBlock > zblocks;
+ // one per z block of this gap request
+ std::size_t getNumGtTempFilesRequired() const
+ {
+     std::size_t const num_zblocks = zblocks.size();
+     return num_zblocks;
+ }
+
bool operator==(MergeStrategyMergeGapRequest const & O) const
{
if ( into != O.into )
=====================================
src/libmaus2/suffixsort/bwtb3m/MergeTree.hpp
=====================================
@@ -147,6 +147,24 @@ namespace libmaus2
return true;
}
+
+ /**
+  * append every node in node_map to the output vector matching its
+  * dynamic type; nodes matching none of the three types are ignored
+  *
+  * the vectors are appended to, not cleared
+  *
+  * @param V_internal receives MergeStrategyMergeInternalBlock nodes
+  * @param V_internal_small receives MergeStrategyMergeInternalSmallBlock nodes
+  * @param V_external receives MergeStrategyMergeExternalBlock nodes
+  **/
+ void getMergeBlocks(
+     std::vector < MergeStrategyMergeInternalBlock * > & V_internal,
+     std::vector < MergeStrategyMergeInternalSmallBlock * > & V_internal_small,
+     std::vector < MergeStrategyMergeExternalBlock * > & V_external
+ )
+ {
+     // iterate by reference: no per-entry copy of the map element
+     for ( auto const & P : node_map ) {
+         auto const p = P.second;
+
+         // cast once per candidate type and reuse the result
+         if ( auto * const internal = dynamic_cast<MergeStrategyMergeInternalBlock *>(p) )
+             V_internal.push_back(internal);
+         else if ( auto * const internal_small = dynamic_cast<MergeStrategyMergeInternalSmallBlock *>(p) )
+             V_internal_small.push_back(internal_small);
+         else if ( auto * const external = dynamic_cast<MergeStrategyMergeExternalBlock *>(p) )
+             V_external.push_back(external);
+     }
+ }
};
inline std::ostream & operator<<(std::ostream & out, MergeTree const & MT)
View it on GitLab: https://salsa.debian.org/med-team/libmaus2/-/commit/f487c23719ada369798f125fb6b89e15a8dfb4d0
--
View it on GitLab: https://salsa.debian.org/med-team/libmaus2/-/commit/f487c23719ada369798f125fb6b89e15a8dfb4d0
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220224/3e98ef3a/attachment-0001.htm>
More information about the debian-med-commit
mailing list