[med-svn] [Git][med-team/minimac4][upstream] New upstream version 4.1.6
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Fri Dec 8 12:19:03 GMT 2023
Étienne Mollier pushed to branch upstream at Debian Med / minimac4
Commits:
280b8344 by Étienne Mollier at 2023-12-08T13:13:32+01:00
New upstream version 4.1.6
- - - - -
6 changed files:
- CMakeLists.txt
- src/input_prep.cpp
- src/input_prep.hpp
- src/main.cpp
- src/prog_args.hpp
- src/unique_haplotype.cpp
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.2)
-project(minimac4 VERSION 4.1.5)
+project(minimac4 VERSION 4.1.6)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose build type (Debug|Release|RelWithDebInfo|MinSizeRel)" FORCE)
=====================================
src/input_prep.cpp
=====================================
@@ -705,9 +705,15 @@ bool convert_old_m3vcf(const std::string& input_path, const std::string& output_
return !input_file.bad() && output_file.good();
}
-bool compress_reference_panel(const std::string& input_path, const std::string& output_path, const std::string& map_file_path)
+bool compress_reference_panel(const std::string& input_path, const std::string& output_path,
+ std::size_t min_block_size,
+ std::size_t max_block_size,
+ std::size_t slope_unit,
+ const std::string& map_file_path)
{
savvy::reader input_file(input_path);
+ if (!input_file)
+ return std::cerr << "Error: could not open input file\n", false;
savvy::variant var;
std::vector<std::int8_t> gts;
@@ -744,35 +750,54 @@ bool compress_reference_panel(const std::string& input_path, const std::string&
headers.emplace_back(it->first, it->second);
}
- if (!input_file.read(var) || !var.get_format("GT", gts))
- return std::cerr << "Error: could not read GT from first variant record in input file\n", false;
+ std::string last_3;
+ if (output_path.size() >= 3)
+ last_3 = output_path.substr(output_path.size() - 3);
+ savvy::writer output_file(output_path, last_3 == "bcf" ? savvy::file::format::bcf : savvy::file::format::sav, headers, input_file.samples(), 6);
+
+ //if (!input_file.read(var))
+ // return input_file.bad() ? std::cerr << "Error: read failure on first record\n", false : true;
+
+ //if (!var.get_format("GT", gts))
+ // return std::cerr << "Error: could not read GT from first variant record in input file\n", false;
- if (header_contigs.find(var.chrom()) == header_contigs.end())
- headers.emplace_back("contig", "<ID=" + var.chrom() + ">");
+ //if (header_contigs.find(var.chrom()) == header_contigs.end())
+ // headers.emplace_back("contig", "<ID=" + var.chrom() + ">");
float flt_nan = std::numeric_limits<float>::quiet_NaN();
unique_haplotype_block block;
- block.compress_variant(reference_site_info(var.chrom(), var.pos(), var.id(), var.ref(), var.alts().size() ? var.alts()[0] : "", flt_nan, flt_nan, std::numeric_limits<double>::quiet_NaN()), gts);
- std::size_t variant_cnt = 1;
+ //block.compress_variant(reference_site_info(var.chrom(), var.pos(), var.id(), var.ref(), var.alts().size() ? var.alts()[0] : "", flt_nan, flt_nan, std::numeric_limits<double>::quiet_NaN()), gts);
+ std::size_t variant_cnt = 0;
std::size_t block_cnt = 0;
- std::string last_3;
- if (output_path.size() >= 3)
- last_3 = output_path.substr(output_path.size() - 3);
- savvy::writer output_file(output_path, last_3 == "bcf" ? savvy::file::format::bcf : savvy::file::format::sav, headers, input_file.samples(), 6);
+ //std::string last_3;
+ //if (output_path.size() >= 3)
+ // last_3 = output_path.substr(output_path.size() - 3);
+ //savvy::writer output_file(output_path, last_3 == "bcf" ? savvy::file::format::bcf : savvy::file::format::sav, headers, input_file.samples(), 6);
auto comp_ratio = [](const unique_haplotype_block& b)
{
- return float(b.expanded_haplotype_size() + b.unique_haplotype_size() * b.variant_size()) / float(b.expanded_haplotype_size() * b.variant_size());
+ return double(b.expanded_haplotype_size() + b.unique_haplotype_size() * b.variant_size()) / double(b.expanded_haplotype_size() * b.variant_size());
};
- const std::size_t min_block_size = 10;
- const std::size_t max_block_size = 0xFFFF; // max s1r block size minus 1 partition record
-
- bool flush_block = false;
+ //const std::size_t min_block_size = 400; //10;
+ //const std::size_t max_block_size = 400; //0xFFFF; // max s1r block size minus 1 partition record
+ std::string prev_chrom;
+ //std::size_t slope_unit = 10;
+ double cr_sum = 0.;
+ double cr_min = 2.;
+ double cr_max = 0.;
+ double prev_cr = 2.;
while (input_file >> var)
{
- float old_cr = comp_ratio(block);
+ if (var.chrom() != prev_chrom && block.variant_size())
+ {
+ if (!block.serialize(output_file))
+ return std::cerr << "Error: serializing block failed\n", false;
+ block.clear();
+ ++block_cnt;
+ }
+
if (!var.get_format("GT", gts))
return std::cerr << "Error: could not read GT from variant record\n", false;
@@ -786,22 +811,32 @@ bool compress_reference_panel(const std::string& input_path, const std::string&
++variant_cnt;
+ bool flush_block = false;
std::size_t cnt = block.variant_size();
+ double new_cr = comp_ratio(block);
if (cnt >= min_block_size)
{
- float new_cr = comp_ratio(block);
- if (new_cr > old_cr || cnt >= max_block_size)
- {
- // CM INFO field would need to be a double in order to have enough precision
- // if (map_file)
- // block.fill_cm(*map_file);
-
- if (!block.serialize(output_file))
- return std::cerr << "Error: serializing block failed\n", false;
- block.clear();
- ++block_cnt;
- }
+ if ((cnt % slope_unit == 0 && new_cr > prev_cr) || cnt >= max_block_size)
+ flush_block = true;
}
+
+ if (flush_block)
+ {
+ if (!block.serialize(output_file))
+ return std::cerr << "Error: serializing block failed\n", false;
+ block.clear();
+ ++block_cnt;
+ if (new_cr < cr_min) cr_min = new_cr;
+ if (new_cr > cr_max) cr_max = new_cr;
+ cr_sum += new_cr;
+ prev_cr=2.;
+ }
+ else if (cnt % slope_unit == 0)
+ {
+ prev_cr = new_cr;
+ }
+
+ prev_chrom = var.chrom();
}
// CM INFO field would need to be a double in order to have enough precision
@@ -810,10 +845,18 @@ bool compress_reference_panel(const std::string& input_path, const std::string&
if (block.variants().size())
{
+ double cr = comp_ratio(block);
+ if (cr < cr_min) cr_min = cr;
+ if (cr > cr_max) cr_max = cr;
+ cr_sum += comp_ratio(block);
if (!block.serialize(output_file))
return std::cerr << "Error: serializing final block failed\n", false;
++block_cnt;
}
+ std::cerr << "Mean Compression Ratio: " << cr_sum / block_cnt << std::endl;
+ std::cerr << "Min Compression Ratio: " << cr_min << std::endl;
+ std::cerr << "Max Compression Ratio: " << cr_max << std::endl;
+
return !input_file.bad() && output_file.good();
}
=====================================
src/input_prep.hpp
=====================================
@@ -38,7 +38,11 @@ bool load_variant_hmm_params(std::vector<target_variant>& tar_variants, reduced_
std::vector<std::vector<std::vector<std::size_t>>> generate_reverse_maps(const reduced_haplotypes& typed_only_reference_data);
bool convert_old_m3vcf(const std::string& input_path, const std::string& output_path, const std::string& map_file_path = "");
-bool compress_reference_panel(const std::string& input_path, const std::string& output_path, const std::string& map_file_path = "");
+bool compress_reference_panel(const std::string& input_path, const std::string& output_path,
+ std::size_t min_block_size = 10,
+ std::size_t max_block_size = 0xFFFF, // max s1r block size minus 1 partition record
+ std::size_t slope_unit = 10,
+ const std::string& map_file_path = "");
#endif // MINIMAC4_INPUT_PREP_HPP
=====================================
src/main.cpp
=====================================
@@ -268,7 +268,7 @@ int main(int argc, char** argv)
return convert_old_m3vcf(args.ref_path(), args.out_path(), args.map_path()) ? EXIT_SUCCESS : EXIT_FAILURE;
if (args.compress_reference())
- return compress_reference_panel(args.ref_path(), args.out_path(), args.map_path()) ? EXIT_SUCCESS : EXIT_FAILURE;
+ return compress_reference_panel(args.ref_path(), args.out_path(), args.min_block_size(), args.max_block_size(), args.slope_unit(), args.map_path()) ? EXIT_SUCCESS : EXIT_FAILURE;
std::uint64_t end_pos = args.region().to();
std::string chrom = args.region().chromosome();
=====================================
src/prog_args.hpp
=====================================
@@ -30,6 +30,9 @@ private:
std::unordered_set<std::string> sample_ids_;
savvy::genomic_region reg_ = {""};
std::size_t temp_buffer_ = 200;
+ std::size_t min_block_size_ = 10;
+ std::size_t max_block_size_ = 0xFFFF; // max s1r block size minus 1 partition record
+ std::size_t slope_unit_ = 10;
std::int64_t chunk_size_ = 20000000;
std::int64_t overlap_ = 3000000;
std::int16_t threads_ = 1;
@@ -70,6 +73,9 @@ public:
std::int64_t overlap() const { return overlap_; }
std::int16_t threads() const { return threads_; }
std::size_t temp_buffer() const { return temp_buffer_ ; }
+ std::size_t min_block_size() const { return min_block_size_; }
+ std::size_t max_block_size() const { return max_block_size_; }
+ std::size_t slope_unit() const { return slope_unit_; }
float decay() const { return decay_; }
float min_r2() const { return min_r2_; }
float min_ratio() const { return min_ratio_; }
@@ -119,6 +125,9 @@ public:
{"temp-prefix", required_argument, 0, '\x02', "Prefix path for temporary output files (default: ${TMPDIR}/m4_)"},
{"update-m3vcf", no_argument, 0, '\x01', "Converts M3VCF to MVCF (default output: /dev/stdout)"},
{"compress-reference", no_argument, 0, '\x01', "Compresses VCF to MVCF (default output: /dev/stdout)"},
+ {"min-block-size", required_argument, 0, '\x02', "Minimium block size for unique haplotype compression (default: 10)"},
+ {"max-block-size", required_argument, 0, '\x02', "Maximum block size for unique haplotype compression (default: 65535)"},
+ {"slope-unit", required_argument, 0, '\x02', "Parameter for unique haplotype compression heuristic (default: 10)"},
// vvvv deprecated vvvv //
{"allTypedSites", no_argument, 0, '\x01', nullptr},
{"rsid", no_argument, 0, '\x01', nullptr},
@@ -349,6 +358,21 @@ public:
sample_ids_.insert(std::istream_iterator<std::string>(ifs), std::istream_iterator<std::string>());
break;
}
+ else if (long_opt_str == "min-block-size")
+ {
+ min_ratio_ = std::max(1ll, std::atoll(optarg ? optarg : ""));
+ break;
+ }
+ else if (long_opt_str == "max-block-size")
+ {
+ min_ratio_ = std::max(1ll, std::atoll(optarg ? optarg : ""));
+ break;
+ }
+ else if (long_opt_str == "slope-unit")
+ {
+ min_ratio_ = std::max(1ll, std::atoll(optarg ? optarg : ""));
+ break;
+ }
else if (long_opt_str == "haps")
{
std::cerr << "Warning: --haps is deprecated\n";
=====================================
src/unique_haplotype.cpp
=====================================
@@ -282,13 +282,19 @@ int unique_haplotype_block::deserialize(savvy::reader& input_file, savvy::varian
void unique_haplotype_block::remove_eov()
{
- for (auto it = unique_map_.begin(); it != unique_map_.end(); )
+ std::size_t eov_cnt = 0;
+ auto it = unique_map_.begin();
+ auto dest_it = it;
+ for ( ; it != unique_map_.end(); ++it)
{
if (savvy::typed_value::is_end_of_vector(*it))
- it = unique_map_.erase(it);
+ ++eov_cnt;
else
- ++it;
+ *(dest_it++) = *it;
}
+
+ if (eov_cnt)
+ unique_map_.resize(unique_map_.size() - eov_cnt);
}
bool unique_haplotype_block::deserialize(std::istream& is, int m3vcf_version, std::size_t n_haplotypes)
@@ -547,4 +553,4 @@ float reduced_haplotypes::compression_ratio() const
}
return num / denom;
-}
\ No newline at end of file
+}
View it on GitLab: https://salsa.debian.org/med-team/minimac4/-/commit/280b8344a4ae372e06a7c970ed915827ee05a4a3
--
View it on GitLab: https://salsa.debian.org/med-team/minimac4/-/commit/280b8344a4ae372e06a7c970ed915827ee05a4a3
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231208/5b5992e6/attachment-0001.htm>
More information about the debian-med-commit
mailing list