[med-svn] [Git][med-team/minimac4][upstream] New upstream version 4.1.5
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Wed Nov 15 16:14:38 GMT 2023
Étienne Mollier pushed to branch upstream at Debian Med / minimac4
Commits:
c86b0f5c by Étienne Mollier at 2023-11-15T16:53:04+01:00
New upstream version 4.1.5
- - - - -
7 changed files:
- CMakeLists.txt
- src/input_prep.cpp
- src/input_prep.hpp
- src/main.cpp
- src/prog_args.hpp
- src/recombination.cpp
- src/recombination.hpp
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.2)
-project(minimac4 VERSION 4.1.4)
+project(minimac4 VERSION 4.1.5)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose build type (Debug|Release|RelWithDebInfo|MinSizeRel)" FORCE)
=====================================
src/input_prep.cpp
=====================================
@@ -172,7 +172,10 @@ bool load_reference_haplotypes(const std::string& file_path,
const std::unordered_set<std::string>& subset_ids,
std::vector<target_variant>& target_sites,
reduced_haplotypes& typed_only_reference_data,
- reduced_haplotypes& full_reference_data)
+ reduced_haplotypes& full_reference_data,
+ genetic_map_file* map_file,
+ float min_recom,
+ float default_match_error)
{
savvy::reader input(file_path);
@@ -198,10 +201,13 @@ bool load_reference_haplotypes(const std::string& file_path,
if (!input.read(var))
return std::cerr << "Notice: no variant records in reference query region (" << extended_reg.chromosome() << ":" << extended_reg.from() << "-" << extended_reg.to() << ")\n", input.bad() ? false : true;
- double start_cm = 0.;
+ double no_recom_prob = 1.;
+ double prev_cm = 0.;
+ uint32_t prev_ref_pos = 0;
std::vector<std::int8_t> tmp_geno;
unique_haplotype_block block;
auto tar_it = target_sites.begin();
+ auto recom_it = tar_it;
int res;
while ((res = block.deserialize(input, var)) > 0)
{
@@ -209,7 +215,7 @@ bool load_reference_haplotypes(const std::string& file_path,
if (block.variants().empty() || block.variants().front().pos > extended_reg.to())
break;
- block.fill_cm_from_recom(start_cm);
+ //block.fill_cm_from_recom(start_cm); // TODO: REMOVE
for (auto ref_it = block.variants().begin(); ref_it != block.variants().end(); ++ref_it)
{
@@ -224,19 +230,43 @@ bool load_reference_haplotypes(const std::string& file_path,
for (std::size_t i = 0; i < tmp_geno.size(); ++i)
tmp_geno[i] = ref_it->gt[block.unique_map()[i]];
+ if (it->pos == recom_it->pos) // This condition is probably redundant with prev_ref_pos check
+ recom_it->recom = 0.f;
+ else
+ recom_it->recom = std::max<float>(1. - no_recom_prob, 0.f); // min_recom_s3);
+ assert(recom_it->recom <= 1.f && recom_it->recom >= 0.f);
+
typed_only_reference_data.compress_variant({ref_it->chrom, ref_it->pos, ref_it->id, ref_it->ref, ref_it->alt, ref_it->err, ref_it->recom, ref_it->cm}, tmp_geno);
- it->af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
+ auto test_af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
it->af = float((--typed_only_reference_data.end())->ac) / tmp_geno.size();
+ assert(test_af == it->af);
+ it->err = std::isnan(ref_it->err) ? default_match_error : ref_it->err;
it->in_ref = true;
if (it != tar_it)
std::swap(*it, *tar_it);
+ no_recom_prob = 1.;
+ recom_it = tar_it;
++tar_it;
break;
}
}
+
+ double switch_prob = std::isnan(ref_it->recom) ? 0.f : ref_it->recom;
+
+ if (map_file)
+ {
+ double cm = map_file->interpolate_centimorgan(ref_it->pos);
+ switch_prob = recombination::cm_to_switch_prob(cm - prev_cm);
+ prev_cm = cm;
+ }
+
+ if (prev_ref_pos > 0 && ref_it->pos != prev_ref_pos)
+ no_recom_prob *= 1. - std::max<double>(switch_prob, min_recom);
+
+ prev_ref_pos = ref_it->pos;
}
block.trim(impute_reg.from(), impute_reg.to());
@@ -244,93 +274,18 @@ bool load_reference_haplotypes(const std::string& file_path,
full_reference_data.append_block(block);
}
+ assert(recom_it != target_sites.end());
+ if (recom_it != target_sites.end())
+ recom_it->recom = 0.f;
+
if (res < 0)
return false;
-
return !input.bad();
}
- else
- {
- shrinkwrap::gz::istream input_file(file_path);
- std::string line;
-
- std::uint8_t m3vcf_version = 0;
- const std::string m3vcf_version_line = "##fileformat=M3VCF";
- while (std::getline(input_file, line))
- {
- if (line.substr(0, m3vcf_version_line.size()) == m3vcf_version_line)
- {
- if (line == "##fileformat=M3VCFv2.0")
- m3vcf_version = 2;
- else
- m3vcf_version = 1;
- break;
- }
-
- if (line.size() < 2 || line[1] != '#')
- {
- std::cerr << "Error: invalid reference file" << std::endl;
- return false;
- }
- }
-
- std::size_t n_samples = 0;
- while (std::getline(input_file, line))
- {
- if (line.size() < 2 || line[1] != '#')
- {
- n_samples = std::count(line.begin(), line.end(), '\t') - 8;
- break;
- }
- }
-
- double start_cm = 0.;
- auto tar_it = target_sites.begin();
- unique_haplotype_block block;
- std::vector<std::int8_t> tmp_geno;
- while (block.deserialize(input_file, m3vcf_version, m3vcf_version == 1 ? n_samples : 2 * n_samples))
- {
- if (block.variants().empty() || block.variants().front().pos > extended_reg.to())
- break;
-
- block.fill_cm_from_recom(start_cm);
-
- for (auto ref_it = block.variants().begin(); ref_it != block.variants().end(); ++ref_it)
- {
- while (tar_it != target_sites.end() && tar_it->pos < ref_it->pos)
- ++tar_it;
-
- for (auto it = tar_it; it != target_sites.end() && it->pos == ref_it->pos; ++it)
- {
- if (it->ref == ref_it->ref && it->alt == ref_it->alt)
- {
- tmp_geno.resize(block.unique_map().size());
- for (std::size_t i = 0; i < tmp_geno.size(); ++i)
- tmp_geno[i] = ref_it->gt[block.unique_map()[i]];
-
- typed_only_reference_data.compress_variant({ref_it->chrom, ref_it->pos, ref_it->id, ref_it->ref, ref_it->alt, ref_it->err, ref_it->recom, ref_it->cm}, tmp_geno);
-
- it->af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
- it->af = float((--typed_only_reference_data.end())->ac) / tmp_geno.size();
- it->in_ref = true;
-
- if (it != tar_it)
- std::swap(*it, *tar_it);
-
- ++tar_it;
- break;
- }
- }
- }
-
- block.trim(impute_reg.from(), impute_reg.to());
- if (!block.variants().empty())
- full_reference_data.append_block(block);
- }
- }
- return !input.bad();
+ std::cerr << "Error: failed to open MVCF file" << std::endl;
+ return false;
}
// Old approach to setting recombination probs.
@@ -428,7 +383,7 @@ bool load_reference_haplotypes_old_recom_approach(const std::string& file_path,
prev_cm = cm;
}
if (ref_read_cnt)
- no_recom_prob *= 1. - prev_recom; //std::max(prev_recom, 1e-5f); // TODO: cm -> recom
+ no_recom_prob *= 1. - std::max(prev_recom, 1e-5f); // TODO: cm -> recom
prev_recom = ref_it->cm; // TODO: cm -> recom
++ref_read_cnt;
}
=====================================
src/input_prep.hpp
=====================================
@@ -17,7 +17,19 @@ bool load_reference_haplotypes(const std::string& file_path,
const std::unordered_set<std::string>& subset_ids,
std::vector<target_variant>& target_sites,
reduced_haplotypes& typed_only_reference_data,
- reduced_haplotypes& full_reference_data);
+ reduced_haplotypes& full_reference_data,
+ genetic_map_file* map_file,
+ float min_recom,
+ float default_match_error);
+
+bool load_reference_haplotypes_old_recom_approach(const std::string& file_path,
+ const savvy::genomic_region& extended_reg,
+ const savvy::genomic_region& impute_reg,
+ const std::unordered_set<std::string>& subset_ids,
+ std::vector<target_variant>& target_sites,
+ reduced_haplotypes& typed_only_reference_data,
+ reduced_haplotypes& full_reference_data,
+ genetic_map_file* map_file);
std::vector<target_variant> separate_target_only_variants(std::vector<target_variant>& target_sites);
@@ -29,4 +41,4 @@ bool convert_old_m3vcf(const std::string& input_path, const std::string& output_
bool compress_reference_panel(const std::string& input_path, const std::string& output_path, const std::string& map_file_path = "");
-#endif // MINIMAC4_INPUT_PREP_HPP
\ No newline at end of file
+#endif // MINIMAC4_INPUT_PREP_HPP
=====================================
src/main.cpp
=====================================
@@ -54,7 +54,8 @@ public:
start_time = std::time(nullptr);
reduced_haplotypes typed_only_reference_data(16, 512);
reduced_haplotypes full_reference_data;
- if (!load_reference_haplotypes(args.ref_path(), extended_region, impute_region, args.sample_ids(), target_sites, typed_only_reference_data, full_reference_data))
+ std::unique_ptr<genetic_map_file> mf(args.map_path().empty() ? nullptr : new genetic_map_file(args.map_path(), impute_region.chromosome()));
+ if (!load_reference_haplotypes(args.ref_path(), extended_region, impute_region, args.sample_ids(), target_sites, typed_only_reference_data, full_reference_data, mf.get(), args.min_recom(), args.error_param()))
return std::cerr << "Error: failed loading reference haplotypes\n", false;
std::cerr << "Loading reference haplotypes took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;
@@ -64,8 +65,8 @@ public:
double impute_time = 0.;
double temp_write_time = 0.;
- std::list<savvy::reader> temp_files;
- std::list<savvy::reader> temp_emp_files;
+ std::list<savvy::reader> temp_files;
+ std::list<savvy::reader> temp_emp_files;
// std::list<std::string> temp_files;
// std::list<std::string> temp_emp_files;
full_dosages_results hmm_results;
@@ -79,7 +80,14 @@ public:
float tar_ref_ratio = float(typed_only_reference_data.variant_size()) / float(full_reference_data.variant_size());
std::cerr << "Typed sites to imputed sites ratio: " << tar_ref_ratio << " (" << typed_only_reference_data.variant_size() << "/" << full_reference_data.variant_size() << ")\n";
if (tar_ref_ratio < args.min_ratio())
- return std::cerr << "Error: not enough target variants are available to impute this chunk. The --min-ratio, --chunk, or --region options may need to be altered.\n", false;
+ {
+ std::cerr << (args.fail_min_ratio() ? "Error" : "Warning") << ": not enough target variants are available to impute this chunk. The --min-ratio, --chunk, or --region options may need to be altered." << std::endl;
+ if (!args.fail_min_ratio())
+ std::cerr << "Warning: skipping chunk " << impute_region.chromosome() << ":" << impute_region.from() << "-" << impute_region.to() << std::endl;
+ if (args.fail_min_ratio())
+ return false;
+ return true; // skip
+ }
if (target_only_sites.size())
{
@@ -97,11 +105,11 @@ public:
if (target_sites.empty())
return std::cerr << "Error: no target variants\n", false;
- std::cerr << "Loading switch probabilities ..." << std::endl;
- start_time = std::time(nullptr);
- if (!load_variant_hmm_params(target_sites, typed_only_reference_data, args.error_param(), args.min_recom(), args.map_path()))
- return std::cerr << "Error: parsing map file failed\n", false;
- std::cerr << "Loading switch probabilities took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;
+// std::cerr << "Loading switch probabilities ..." << std::endl;
+// start_time = std::time(nullptr);
+// if (!load_variant_hmm_params(target_sites, typed_only_reference_data, args.error_param(), args.min_recom(), args.map_path()))
+// return std::cerr << "Error: parsing map file failed\n", false;
+// std::cerr << "Loading switch probabilities took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;
auto reverse_maps = generate_reverse_maps(typed_only_reference_data);
=====================================
src/prog_args.hpp
=====================================
@@ -35,7 +35,7 @@ private:
std::int16_t threads_ = 1;
float decay_ = 0.f;
float min_r2_ = -1.f;
- float min_ratio_ = 1e-5f;
+ float min_ratio_ = 1e-4f;
float prob_threshold_ = 0.01f;
float prob_threshold_s1_ = -1.f;
float diff_threshold_ = 0.01f;
@@ -46,6 +46,7 @@ private:
bool compress_reference_ = false;
bool pass_only_ = false;
bool meta_ = false; // deprecated
+ bool fail_min_ratio_ = true;
bool help_ = false;
bool version_ = false;
@@ -81,6 +82,7 @@ public:
bool update_m3vcf() const { return update_m3vcf_; }
bool compress_reference() const { return compress_reference_; }
bool pass_only() const { return pass_only_; }
+ bool fail_min_ratio() const { return fail_min_ratio_; }
prog_args() :
getopt_wrapper(
@@ -105,9 +107,10 @@ public:
{"overlap", required_argument, 0, 'w', "Size (in base pairs) of overlap before and after impute region to use as input to HMM (default: 3000000)"},
{"decay", required_argument, 0, '\x02', "Decay rate for dosages in flanking regions (default: disabled with 0)"},
{"min-r2", required_argument, 0, '\x02', "Minimum estimated r-square for output variants"},
- {"min-ratio", required_argument, 0, '\x02', "Minimum ratio of number of target sites to reference sites (default: 0.00001)"},
+ {"min-ratio", required_argument, 0, '\x02', "Minimum ratio of number of target sites to reference sites (default: 1e-4)"},
+ {"min-ratio-behavior", required_argument, 0, '\x02', "Behavior for when --min-ratio is not met (\"skip\" or \"fail\"; default: fail)"}, // maybe add "warn"
{"match-error", required_argument, 0, '\x02', "Error parameter for HMM match probabilities (default: 0.01)"},
- {"min-recom", required_argument, 0, '\x02', "Minimum recombination probability (default: 0.00001)"},
+ {"min-recom", required_argument, 0, '\x02', "Minimum recombination probability (default: 1e-5)"},
{"prob-threshold", required_argument, 0, '\x02', "Probability threshold used for template selection"},
{"prob-threshold-s1", required_argument, 0, '\x02', "Probability threshold used for template selection in original state space"},
{"diff-threshold", required_argument, 0, '\x02', "Probability diff threshold used in template selection"},
@@ -299,6 +302,11 @@ public:
min_ratio_ = std::min(1., std::max(0., std::atof(optarg ? optarg : "")));
break;
}
+ else if (long_opt_str == "min-ratio-behavior")
+ {
+ fail_min_ratio_ = std::string(optarg ? optarg : "") == "fail";
+ break;
+ }
else if (long_opt_str == "match-error")
{
error_param_ = std::min(0.5, std::max(0., std::atof(optarg ? optarg : "")));
@@ -537,4 +545,4 @@ private:
}
};
-#endif // MINIMAC4_PROG_ARGS_HPP
\ No newline at end of file
+#endif // MINIMAC4_PROG_ARGS_HPP
=====================================
src/recombination.cpp
=====================================
@@ -152,14 +152,14 @@ genetic_map_file::genetic_map_file(const std::string& map_file_path, const std::
}
}
-float genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
+double genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
{
if (good_)
{
if (variant_pos < prev_rec_.pos)
{
auto basepair_cm = prev_rec_.map_value / double(prev_rec_.pos);
- return float(double(variant_pos) * basepair_cm);
+ return double(double(variant_pos) * basepair_cm);
}
record temp_rec;
@@ -176,13 +176,13 @@ float genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
if (variant_pos < cur_rec_.pos)
{
assert(variant_pos - prev_rec_.pos < variant_pos); //TODO: handle gracefully
- return float(prev_rec_.map_value + double(variant_pos - prev_rec_.pos) * basepair_cm);
+ return double(prev_rec_.map_value + double(variant_pos - prev_rec_.pos) * basepair_cm);
}
- return float(cur_rec_.map_value + double(variant_pos - cur_rec_.pos) * basepair_cm); // Should we assume that basepair_cm is zero?
+ return double(cur_rec_.map_value + double(variant_pos - cur_rec_.pos) * basepair_cm); // Should we assume that basepair_cm is zero?
}
- return std::numeric_limits<float>::quiet_NaN();;
+ return std::numeric_limits<double>::quiet_NaN();;
}
bool genetic_map_file::read_record(record& entry)
=====================================
src/recombination.hpp
=====================================
@@ -56,7 +56,7 @@ public:
operator bool() const { return good_; }
// This must be called in order. The pos argument must be >= to pos passed in previous call
- float interpolate_centimorgan(std::size_t variant_pos);
+ double interpolate_centimorgan(std::size_t variant_pos);
private:
bool read_record(record& rec);
};
View it on GitLab: https://salsa.debian.org/med-team/minimac4/-/commit/c86b0f5cc17bdd9d9ddf71c4246d82ede93e0659
--
View it on GitLab: https://salsa.debian.org/med-team/minimac4/-/commit/c86b0f5cc17bdd9d9ddf71c4246d82ede93e0659
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231115/a7e4d00f/attachment-0001.htm>
More information about the debian-med-commit
mailing list