[med-svn] [Git][med-team/minimac4][upstream] New upstream version 4.1.5

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Wed Nov 15 16:14:38 GMT 2023



Étienne Mollier pushed to branch upstream at Debian Med / minimac4


Commits:
c86b0f5c by Étienne Mollier at 2023-11-15T16:53:04+01:00
New upstream version 4.1.5
- - - - -


7 changed files:

- CMakeLists.txt
- src/input_prep.cpp
- src/input_prep.hpp
- src/main.cpp
- src/prog_args.hpp
- src/recombination.cpp
- src/recombination.hpp


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.2)
-project(minimac4 VERSION 4.1.4)
+project(minimac4 VERSION 4.1.5)
 
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose build type (Debug|Release|RelWithDebInfo|MinSizeRel)" FORCE)


=====================================
src/input_prep.cpp
=====================================
@@ -172,7 +172,10 @@ bool load_reference_haplotypes(const std::string& file_path,
   const std::unordered_set<std::string>& subset_ids,
   std::vector<target_variant>& target_sites,
   reduced_haplotypes& typed_only_reference_data,
-  reduced_haplotypes& full_reference_data)
+  reduced_haplotypes& full_reference_data,
+  genetic_map_file* map_file,
+  float min_recom,
+  float default_match_error)
 {
   savvy::reader input(file_path);
 
@@ -198,10 +201,13 @@ bool load_reference_haplotypes(const std::string& file_path,
     if (!input.read(var))
       return std::cerr << "Notice: no variant records in reference query region (" << extended_reg.chromosome() << ":" << extended_reg.from() << "-" << extended_reg.to() << ")\n", input.bad() ? false : true;
 
-    double start_cm = 0.;
+    double no_recom_prob = 1.;
+    double prev_cm = 0.;
+    uint32_t prev_ref_pos = 0;
     std::vector<std::int8_t> tmp_geno;
     unique_haplotype_block block;
     auto tar_it = target_sites.begin();
+    auto recom_it = tar_it;
     int res;
     while ((res = block.deserialize(input, var)) > 0)
     {
@@ -209,7 +215,7 @@ bool load_reference_haplotypes(const std::string& file_path,
       if (block.variants().empty() || block.variants().front().pos > extended_reg.to())
         break;
 
-      block.fill_cm_from_recom(start_cm);
+      //block.fill_cm_from_recom(start_cm); // TODO: REMOVE
 
       for (auto ref_it = block.variants().begin(); ref_it != block.variants().end(); ++ref_it)
       {
@@ -224,19 +230,43 @@ bool load_reference_haplotypes(const std::string& file_path,
             for (std::size_t i = 0; i < tmp_geno.size(); ++i)
               tmp_geno[i] = ref_it->gt[block.unique_map()[i]];
 
+            if (it->pos == recom_it->pos) // This condition is probably redundant with prev_ref_pos check
+              recom_it->recom = 0.f;
+            else
+              recom_it->recom = std::max<float>(1. - no_recom_prob, 0.f); // min_recom_s3);
+            assert(recom_it->recom <= 1.f && recom_it->recom >= 0.f);
+
             typed_only_reference_data.compress_variant({ref_it->chrom, ref_it->pos, ref_it->id, ref_it->ref, ref_it->alt, ref_it->err, ref_it->recom, ref_it->cm}, tmp_geno);
 
-            it->af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
+            auto test_af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
             it->af = float((--typed_only_reference_data.end())->ac) / tmp_geno.size();
+            assert(test_af == it->af);
+            it->err = std::isnan(ref_it->err) ? default_match_error : ref_it->err;
             it->in_ref = true;
 
             if (it != tar_it)
               std::swap(*it, *tar_it);
 
+            no_recom_prob = 1.;
+            recom_it = tar_it;
             ++tar_it;
             break;
           }
         }
+
+        double switch_prob = std::isnan(ref_it->recom) ? 0.f : ref_it->recom;
+
+        if (map_file)
+        {
+          double cm = map_file->interpolate_centimorgan(ref_it->pos);
+          switch_prob = recombination::cm_to_switch_prob(cm - prev_cm);
+          prev_cm = cm;
+        }
+
+        if (prev_ref_pos > 0 && ref_it->pos != prev_ref_pos)
+          no_recom_prob *= 1. - std::max<double>(switch_prob, min_recom);
+        
+        prev_ref_pos = ref_it->pos;
       }
 
       block.trim(impute_reg.from(), impute_reg.to());
@@ -244,93 +274,18 @@ bool load_reference_haplotypes(const std::string& file_path,
         full_reference_data.append_block(block);
     }
 
+    assert(recom_it != target_sites.end());
+    if (recom_it != target_sites.end())
+      recom_it->recom = 0.f;
+
     if (res < 0)
       return false;
 
-
     return !input.bad();
   }
-  else
-  {
-    shrinkwrap::gz::istream input_file(file_path);
-    std::string line;
-
-    std::uint8_t m3vcf_version = 0;
-    const std::string m3vcf_version_line = "##fileformat=M3VCF";
-    while (std::getline(input_file, line))
-    {
-      if (line.substr(0, m3vcf_version_line.size()) == m3vcf_version_line)
-      {
-        if (line == "##fileformat=M3VCFv2.0")
-          m3vcf_version = 2;
-        else
-          m3vcf_version = 1;
-        break;
-      }
-
-      if (line.size() < 2 || line[1] != '#')
-      {
-        std::cerr << "Error: invalid reference file" << std::endl;
-        return false;
-      }
-    }
-
-    std::size_t n_samples = 0;
-    while (std::getline(input_file, line))
-    {
-      if (line.size() < 2 || line[1] != '#')
-      {
-        n_samples = std::count(line.begin(), line.end(), '\t') - 8;
-        break;
-      }
-    }
-
-    double start_cm = 0.;
-    auto tar_it = target_sites.begin();
-    unique_haplotype_block block;
-    std::vector<std::int8_t> tmp_geno;
-    while (block.deserialize(input_file, m3vcf_version, m3vcf_version == 1 ? n_samples : 2 * n_samples))
-    {
-      if (block.variants().empty() || block.variants().front().pos > extended_reg.to())
-        break;
-
-      block.fill_cm_from_recom(start_cm);
-
-      for (auto ref_it = block.variants().begin(); ref_it != block.variants().end(); ++ref_it)
-      {
-        while (tar_it != target_sites.end() && tar_it->pos < ref_it->pos)
-          ++tar_it;
-
-        for (auto it = tar_it; it != target_sites.end() && it->pos == ref_it->pos; ++it)
-        {
-          if (it->ref == ref_it->ref && it->alt == ref_it->alt)
-          {
-            tmp_geno.resize(block.unique_map().size());
-            for (std::size_t i = 0; i < tmp_geno.size(); ++i)
-              tmp_geno[i] = ref_it->gt[block.unique_map()[i]];
-
-            typed_only_reference_data.compress_variant({ref_it->chrom, ref_it->pos, ref_it->id, ref_it->ref, ref_it->alt, ref_it->err, ref_it->recom, ref_it->cm}, tmp_geno);
-
-            it->af = std::accumulate(tmp_geno.begin(), tmp_geno.end(), 0.f) / tmp_geno.size();
-            it->af = float((--typed_only_reference_data.end())->ac) / tmp_geno.size();
-            it->in_ref = true;
-
-            if (it != tar_it)
-              std::swap(*it, *tar_it);
-
-            ++tar_it;
-            break;
-          }
-        }
-      }
-
-      block.trim(impute_reg.from(), impute_reg.to());
-      if (!block.variants().empty())
-        full_reference_data.append_block(block);
-    }
-  }
 
-  return !input.bad();
+  std::cerr << "Error: failed to open MVCF file" << std::endl;
+  return false;
 }
 
 // Old approach to setting recombination probs.
@@ -428,7 +383,7 @@ bool load_reference_haplotypes_old_recom_approach(const std::string& file_path,
           prev_cm = cm;
         }
         if (ref_read_cnt)
-          no_recom_prob *= 1. - prev_recom; //std::max(prev_recom, 1e-5f); // TODO: cm -> recom
+          no_recom_prob *= 1. - std::max(prev_recom, 1e-5f); // TODO: cm -> recom
         prev_recom = ref_it->cm; // TODO: cm -> recom
         ++ref_read_cnt;
       }


=====================================
src/input_prep.hpp
=====================================
@@ -17,7 +17,19 @@ bool load_reference_haplotypes(const std::string& file_path,
   const std::unordered_set<std::string>& subset_ids,
   std::vector<target_variant>& target_sites,
   reduced_haplotypes& typed_only_reference_data,
-  reduced_haplotypes& full_reference_data);
+  reduced_haplotypes& full_reference_data,
+  genetic_map_file* map_file,
+  float min_recom,
+  float default_match_error);
+
+bool load_reference_haplotypes_old_recom_approach(const std::string& file_path,
+  const savvy::genomic_region& extended_reg,
+  const savvy::genomic_region& impute_reg,
+  const std::unordered_set<std::string>& subset_ids,
+  std::vector<target_variant>& target_sites,
+  reduced_haplotypes& typed_only_reference_data,
+  reduced_haplotypes& full_reference_data,
+  genetic_map_file* map_file);
 
 std::vector<target_variant> separate_target_only_variants(std::vector<target_variant>& target_sites);
 
@@ -29,4 +41,4 @@ bool convert_old_m3vcf(const std::string& input_path, const std::string& output_
 bool compress_reference_panel(const std::string& input_path, const std::string& output_path, const std::string& map_file_path = "");
 
 
-#endif // MINIMAC4_INPUT_PREP_HPP
\ No newline at end of file
+#endif // MINIMAC4_INPUT_PREP_HPP


=====================================
src/main.cpp
=====================================
@@ -54,7 +54,8 @@ public:
     start_time = std::time(nullptr);
     reduced_haplotypes typed_only_reference_data(16, 512);
     reduced_haplotypes full_reference_data;
-    if (!load_reference_haplotypes(args.ref_path(), extended_region, impute_region, args.sample_ids(), target_sites, typed_only_reference_data, full_reference_data))
+    std::unique_ptr<genetic_map_file> mf(args.map_path().empty() ? nullptr : new genetic_map_file(args.map_path(), impute_region.chromosome()));
+    if (!load_reference_haplotypes(args.ref_path(), extended_region, impute_region, args.sample_ids(), target_sites, typed_only_reference_data, full_reference_data, mf.get(), args.min_recom(), args.error_param()))
       return std::cerr << "Error: failed loading reference haplotypes\n", false;
     std::cerr << "Loading reference haplotypes took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;
 
@@ -64,8 +65,8 @@ public:
     double impute_time = 0.;
     double temp_write_time = 0.;
 
-      std::list<savvy::reader> temp_files;
-      std::list<savvy::reader> temp_emp_files;
+    std::list<savvy::reader> temp_files;
+    std::list<savvy::reader> temp_emp_files;
 //    std::list<std::string> temp_files;
 //    std::list<std::string> temp_emp_files;
     full_dosages_results hmm_results;
@@ -79,7 +80,14 @@ public:
       float tar_ref_ratio = float(typed_only_reference_data.variant_size()) / float(full_reference_data.variant_size());
       std::cerr << "Typed sites to imputed sites ratio: " << tar_ref_ratio << " (" << typed_only_reference_data.variant_size() << "/" << full_reference_data.variant_size() << ")\n";
       if (tar_ref_ratio < args.min_ratio())
-        return std::cerr << "Error: not enough target variants are available to impute this chunk. The --min-ratio, --chunk, or --region options may need to be altered.\n", false;
+      {
+        std::cerr << (args.fail_min_ratio() ? "Error" : "Warning")  << ": not enough target variants are available to impute this chunk. The --min-ratio, --chunk, or --region options may need to be altered." << std::endl;
+        if (!args.fail_min_ratio())
+          std::cerr << "Warning: skipping chunk " << impute_region.chromosome() << ":" << impute_region.from() << "-" << impute_region.to() << std::endl;
+        if (args.fail_min_ratio())
+          return false;
+        return true; // skip
+      }
 
       if (target_only_sites.size())
       {
@@ -97,11 +105,11 @@ public:
       if (target_sites.empty())
         return std::cerr << "Error: no target variants\n", false;
 
-      std::cerr << "Loading switch probabilities ..." << std::endl;
-      start_time = std::time(nullptr);
-      if (!load_variant_hmm_params(target_sites, typed_only_reference_data, args.error_param(), args.min_recom(), args.map_path()))
-        return std::cerr << "Error: parsing map file failed\n", false;
-      std::cerr << "Loading switch probabilities took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;
+//      std::cerr << "Loading switch probabilities ..." << std::endl;
+//      start_time = std::time(nullptr);
+//      if (!load_variant_hmm_params(target_sites, typed_only_reference_data, args.error_param(), args.min_recom(), args.map_path()))
+//        return std::cerr << "Error: parsing map file failed\n", false;
+//      std::cerr << "Loading switch probabilities took " << record_input_time(std::difftime(std::time(nullptr), start_time)) << " seconds" << std::endl;
 
       auto reverse_maps = generate_reverse_maps(typed_only_reference_data);
 


=====================================
src/prog_args.hpp
=====================================
@@ -35,7 +35,7 @@ private:
   std::int16_t threads_ = 1;
   float decay_ = 0.f;
   float min_r2_ = -1.f;
-  float min_ratio_ = 1e-5f;
+  float min_ratio_ = 1e-4f;
   float prob_threshold_ = 0.01f;
   float prob_threshold_s1_ = -1.f;
   float diff_threshold_ = 0.01f;
@@ -46,6 +46,7 @@ private:
   bool compress_reference_ = false;
   bool pass_only_ = false;
   bool meta_ = false; // deprecated
+  bool fail_min_ratio_ = true;
   bool help_ = false;
   bool version_ = false;
 
@@ -81,6 +82,7 @@ public:
   bool update_m3vcf() const { return update_m3vcf_; }
   bool compress_reference() const { return compress_reference_; }
   bool pass_only() const { return pass_only_; }
+  bool fail_min_ratio() const { return fail_min_ratio_; }
 
   prog_args() :
     getopt_wrapper(
@@ -105,9 +107,10 @@ public:
         {"overlap", required_argument, 0, 'w', "Size (in base pairs) of overlap before and after impute region to use as input to HMM (default: 3000000)"},
         {"decay", required_argument, 0, '\x02', "Decay rate for dosages in flanking regions (default: disabled with 0)"},
         {"min-r2", required_argument, 0, '\x02', "Minimum estimated r-square for output variants"},
-        {"min-ratio", required_argument, 0, '\x02', "Minimum ratio of number of target sites to reference sites (default: 0.00001)"},
+        {"min-ratio", required_argument, 0, '\x02', "Minimum ratio of number of target sites to reference sites (default: 1e-4)"},
+        {"min-ratio-behavior", required_argument, 0, '\x02', "Behavior for when --min-ratio is not met (\"skip\" or \"fail\"; default: fail)"}, // maybe add "warn"
         {"match-error", required_argument, 0, '\x02', "Error parameter for HMM match probabilities (default: 0.01)"},
-        {"min-recom", required_argument, 0, '\x02', "Minimum recombination probability (default: 0.00001)"},
+        {"min-recom", required_argument, 0, '\x02', "Minimum recombination probability (default: 1e-5)"},
         {"prob-threshold", required_argument, 0, '\x02', "Probability threshold used for template selection"},
         {"prob-threshold-s1", required_argument, 0, '\x02', "Probability threshold used for template selection in original state space"},
         {"diff-threshold", required_argument, 0, '\x02', "Probability diff threshold used in template selection"},
@@ -299,6 +302,11 @@ public:
             min_ratio_ = std::min(1., std::max(0., std::atof(optarg ? optarg : "")));
             break;
           }
+          else if (long_opt_str == "min-ratio-behavior")
+          {
+            fail_min_ratio_ = std::string(optarg ? optarg : "") == "fail";
+            break;
+          }
           else if (long_opt_str == "match-error")
           {
             error_param_ = std::min(0.5, std::max(0., std::atof(optarg ? optarg : "")));
@@ -537,4 +545,4 @@ private:
   }
 };
 
-#endif // MINIMAC4_PROG_ARGS_HPP
\ No newline at end of file
+#endif // MINIMAC4_PROG_ARGS_HPP


=====================================
src/recombination.cpp
=====================================
@@ -152,14 +152,14 @@ genetic_map_file::genetic_map_file(const std::string& map_file_path, const std::
   }
 }
 
-float genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
+double genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
 {
   if (good_)
   {
     if (variant_pos < prev_rec_.pos)
     {
       auto basepair_cm = prev_rec_.map_value / double(prev_rec_.pos);
-      return float(double(variant_pos) * basepair_cm);
+      return double(double(variant_pos) * basepair_cm);
     }
 
     record temp_rec;
@@ -176,13 +176,13 @@ float genetic_map_file::interpolate_centimorgan(std::size_t variant_pos)
     if (variant_pos < cur_rec_.pos)
     {
       assert(variant_pos - prev_rec_.pos < variant_pos); //TODO: handle gracefully
-      return float(prev_rec_.map_value + double(variant_pos - prev_rec_.pos) * basepair_cm);
+      return double(prev_rec_.map_value + double(variant_pos - prev_rec_.pos) * basepair_cm);
     }
 
-    return float(cur_rec_.map_value + double(variant_pos - cur_rec_.pos) * basepair_cm); // Should we assume that basepair_cm is zero?
+    return double(cur_rec_.map_value + double(variant_pos - cur_rec_.pos) * basepair_cm); // Should we assume that basepair_cm is zero?
   }
 
-  return std::numeric_limits<float>::quiet_NaN();;
+  return std::numeric_limits<double>::quiet_NaN();;
 }
 
 bool genetic_map_file::read_record(record& entry)


=====================================
src/recombination.hpp
=====================================
@@ -56,7 +56,7 @@ public:
   operator bool() const { return good_; }
 
   // This must be called in order. The pos argument must be >= to pos passed in previous call
-  float interpolate_centimorgan(std::size_t variant_pos);
+  double interpolate_centimorgan(std::size_t variant_pos);
 private:
   bool read_record(record& rec);
 };



View it on GitLab: https://salsa.debian.org/med-team/minimac4/-/commit/c86b0f5cc17bdd9d9ddf71c4246d82ede93e0659

-- 
View it on GitLab: https://salsa.debian.org/med-team/minimac4/-/commit/c86b0f5cc17bdd9d9ddf71c4246d82ede93e0659
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231115/a7e4d00f/attachment-0001.htm>


More information about the debian-med-commit mailing list