[med-svn] [Git][med-team/minimac4][upstream] New upstream version 4.1.6

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Fri Dec 8 12:19:03 GMT 2023



Étienne Mollier pushed to branch upstream at Debian Med / minimac4


Commits:
280b8344 by Étienne Mollier at 2023-12-08T13:13:32+01:00
New upstream version 4.1.6
- - - - -


6 changed files:

- CMakeLists.txt
- src/input_prep.cpp
- src/input_prep.hpp
- src/main.cpp
- src/prog_args.hpp
- src/unique_haplotype.cpp


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.2)
-project(minimac4 VERSION 4.1.5)
+project(minimac4 VERSION 4.1.6)
 
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose build type (Debug|Release|RelWithDebInfo|MinSizeRel)" FORCE)


=====================================
src/input_prep.cpp
=====================================
@@ -705,9 +705,15 @@ bool convert_old_m3vcf(const std::string& input_path, const std::string& output_
   return !input_file.bad() && output_file.good();
 }
 
-bool compress_reference_panel(const std::string& input_path, const std::string& output_path, const std::string& map_file_path)
+bool compress_reference_panel(const std::string& input_path, const std::string& output_path,
+  std::size_t min_block_size,
+  std::size_t max_block_size,
+  std::size_t slope_unit,
+  const std::string& map_file_path)
 {
   savvy::reader input_file(input_path);
+  if (!input_file)
+    return std::cerr << "Error: could not open input file\n", false;
   savvy::variant var;
   std::vector<std::int8_t> gts;
 
@@ -744,35 +750,54 @@ bool compress_reference_panel(const std::string& input_path, const std::string&
       headers.emplace_back(it->first, it->second);
   }
 
-  if (!input_file.read(var) || !var.get_format("GT", gts))
-    return std::cerr << "Error: could not read GT from first variant record in input file\n", false;
+  std::string last_3;
+  if (output_path.size() >= 3)
+    last_3 = output_path.substr(output_path.size() - 3);
+  savvy::writer output_file(output_path, last_3 == "bcf" ? savvy::file::format::bcf : savvy::file::format::sav, headers, input_file.samples(), 6);
+
+  //if (!input_file.read(var))
+  //  return input_file.bad() ? std::cerr << "Error: read failure on first record\n", false : true;
+
+  //if (!var.get_format("GT", gts))
+  //  return std::cerr << "Error: could not read GT from first variant record in input file\n", false;
 
-  if (header_contigs.find(var.chrom()) == header_contigs.end())
-    headers.emplace_back("contig", "<ID=" + var.chrom() + ">");
+  //if (header_contigs.find(var.chrom()) == header_contigs.end())
+  //  headers.emplace_back("contig", "<ID=" + var.chrom() + ">");
 
   float flt_nan = std::numeric_limits<float>::quiet_NaN();
   unique_haplotype_block block;
-  block.compress_variant(reference_site_info(var.chrom(), var.pos(), var.id(), var.ref(), var.alts().size() ? var.alts()[0] : "", flt_nan, flt_nan, std::numeric_limits<double>::quiet_NaN()), gts);
-  std::size_t variant_cnt = 1;
+  //block.compress_variant(reference_site_info(var.chrom(), var.pos(), var.id(), var.ref(), var.alts().size() ? var.alts()[0] : "", flt_nan, flt_nan, std::numeric_limits<double>::quiet_NaN()), gts);
+  std::size_t variant_cnt = 0;
   std::size_t block_cnt = 0;
 
-  std::string last_3;
-  if (output_path.size() >= 3)
-    last_3 = output_path.substr(output_path.size() - 3);
-  savvy::writer output_file(output_path, last_3 == "bcf" ? savvy::file::format::bcf : savvy::file::format::sav, headers, input_file.samples(), 6);
+  //std::string last_3;
+  //if (output_path.size() >= 3)
+  //  last_3 = output_path.substr(output_path.size() - 3);
+  //savvy::writer output_file(output_path, last_3 == "bcf" ? savvy::file::format::bcf : savvy::file::format::sav, headers, input_file.samples(), 6);
 
   auto comp_ratio = [](const unique_haplotype_block& b)
   {
-    return float(b.expanded_haplotype_size() + b.unique_haplotype_size() * b.variant_size()) / float(b.expanded_haplotype_size() * b.variant_size());
+    return double(b.expanded_haplotype_size() + b.unique_haplotype_size() * b.variant_size()) / double(b.expanded_haplotype_size() * b.variant_size());
   };
 
-  const std::size_t min_block_size = 10;
-  const std::size_t max_block_size = 0xFFFF; // max s1r block size minus 1 partition record
-
-  bool flush_block = false;
+  //const std::size_t min_block_size = 400; //10;
+  //const std::size_t max_block_size = 400; //0xFFFF; // max s1r block size minus 1 partition record
+  std::string prev_chrom;
+  //std::size_t slope_unit = 10;
+  double cr_sum = 0.;
+  double cr_min = 2.;
+  double cr_max = 0.;
+  double prev_cr = 2.;
   while (input_file >> var)
   {
-    float old_cr = comp_ratio(block);
+    if (var.chrom() != prev_chrom && block.variant_size())
+    {
+      if (!block.serialize(output_file))
+        return std::cerr << "Error: serializing block failed\n", false;
+      block.clear();
+      ++block_cnt;
+    }
+
 
     if (!var.get_format("GT", gts))
       return std::cerr << "Error: could not read GT from variant record\n", false;
@@ -786,22 +811,32 @@ bool compress_reference_panel(const std::string& input_path, const std::string&
 
     ++variant_cnt;
 
+    bool flush_block = false;
     std::size_t cnt = block.variant_size();
+    double new_cr = comp_ratio(block);
     if (cnt >= min_block_size)
     {
-      float new_cr = comp_ratio(block);
-      if (new_cr > old_cr || cnt >= max_block_size)
-      {
-        // CM INFO field would need to be a double in order to have enough precision
-        // if (map_file)
-        //  block.fill_cm(*map_file);
-
-        if (!block.serialize(output_file))
-          return std::cerr << "Error: serializing block failed\n", false;
-        block.clear();
-        ++block_cnt;
-      }
+      if ((cnt % slope_unit == 0 && new_cr > prev_cr) || cnt >= max_block_size)
+        flush_block = true;
     }
+   
+    if (flush_block)
+    {
+      if (!block.serialize(output_file))
+        return std::cerr << "Error: serializing block failed\n", false;
+      block.clear();
+      ++block_cnt;
+      if (new_cr < cr_min) cr_min = new_cr;
+      if (new_cr > cr_max) cr_max = new_cr;
+      cr_sum += new_cr;
+      prev_cr=2.;
+    }
+    else if (cnt % slope_unit == 0)
+    {
+      prev_cr = new_cr;
+    }
+
+    prev_chrom = var.chrom();
   }
 
   // CM INFO field would need to be a double in order to have enough precision
@@ -810,10 +845,18 @@ bool compress_reference_panel(const std::string& input_path, const std::string&
 
   if (block.variants().size())
   {
+    double cr = comp_ratio(block);
+    if (cr < cr_min) cr_min = cr;
+    if (cr > cr_max) cr_max = cr;
+    cr_sum += comp_ratio(block);
     if (!block.serialize(output_file))
       return std::cerr << "Error: serializing final block failed\n", false;
     ++block_cnt;
   }
 
+  std::cerr << "Mean Compression Ratio: " << cr_sum / block_cnt << std::endl;
+  std::cerr << "Min Compression Ratio: " << cr_min << std::endl;
+  std::cerr << "Max Compression Ratio: " << cr_max << std::endl;
+
   return !input_file.bad() && output_file.good();
 }


=====================================
src/input_prep.hpp
=====================================
@@ -38,7 +38,11 @@ bool load_variant_hmm_params(std::vector<target_variant>& tar_variants, reduced_
 std::vector<std::vector<std::vector<std::size_t>>> generate_reverse_maps(const reduced_haplotypes& typed_only_reference_data);
 
 bool convert_old_m3vcf(const std::string& input_path, const std::string& output_path, const std::string& map_file_path = "");
-bool compress_reference_panel(const std::string& input_path, const std::string& output_path, const std::string& map_file_path = "");
+bool compress_reference_panel(const std::string& input_path, const std::string& output_path,
+  std::size_t min_block_size = 10,
+  std::size_t max_block_size = 0xFFFF, // max s1r block size minus 1 partition record
+  std::size_t slope_unit = 10, 
+  const std::string& map_file_path = "");
 
 
 #endif // MINIMAC4_INPUT_PREP_HPP


=====================================
src/main.cpp
=====================================
@@ -268,7 +268,7 @@ int main(int argc, char** argv)
     return convert_old_m3vcf(args.ref_path(), args.out_path(), args.map_path()) ? EXIT_SUCCESS : EXIT_FAILURE;
 
   if (args.compress_reference())
-    return compress_reference_panel(args.ref_path(), args.out_path(), args.map_path()) ? EXIT_SUCCESS : EXIT_FAILURE;
+    return compress_reference_panel(args.ref_path(), args.out_path(), args.min_block_size(), args.max_block_size(), args.slope_unit(), args.map_path()) ? EXIT_SUCCESS : EXIT_FAILURE;
 
   std::uint64_t end_pos = args.region().to();
   std::string chrom = args.region().chromosome();


=====================================
src/prog_args.hpp
=====================================
@@ -30,6 +30,9 @@ private:
   std::unordered_set<std::string> sample_ids_;
   savvy::genomic_region reg_ = {""};
   std::size_t temp_buffer_ = 200;
+  std::size_t min_block_size_ = 10;
+  std::size_t max_block_size_ = 0xFFFF; // max s1r block size minus 1 partition record
+  std::size_t slope_unit_ = 10;
   std::int64_t chunk_size_ = 20000000;
   std::int64_t overlap_ = 3000000;
   std::int16_t threads_ = 1;
@@ -70,6 +73,9 @@ public:
   std::int64_t overlap() const { return overlap_; }
   std::int16_t threads() const { return threads_; }
   std::size_t temp_buffer() const { return temp_buffer_ ; }
+  std::size_t min_block_size() const { return min_block_size_; }
+  std::size_t max_block_size() const { return max_block_size_; }
+  std::size_t slope_unit() const { return slope_unit_; }
   float decay() const { return decay_; }
   float min_r2() const { return min_r2_; }
   float min_ratio() const { return min_ratio_; }
@@ -119,6 +125,9 @@ public:
         {"temp-prefix", required_argument, 0, '\x02', "Prefix path for temporary output files (default: ${TMPDIR}/m4_)"},
         {"update-m3vcf", no_argument, 0, '\x01', "Converts M3VCF to MVCF (default output: /dev/stdout)"},
         {"compress-reference", no_argument, 0, '\x01', "Compresses VCF to MVCF (default output: /dev/stdout)"},
+        {"min-block-size", required_argument, 0, '\x02', "Minimium block size for unique haplotype compression (default: 10)"},
+        {"max-block-size", required_argument, 0, '\x02', "Maximum block size for unique haplotype compression (default: 65535)"},
+        {"slope-unit", required_argument, 0, '\x02', "Parameter for unique haplotype compression heuristic (default: 10)"},
         // vvvv deprecated vvvv //
         {"allTypedSites", no_argument, 0, '\x01', nullptr},
         {"rsid", no_argument, 0, '\x01', nullptr},
@@ -349,6 +358,21 @@ public:
             sample_ids_.insert(std::istream_iterator<std::string>(ifs), std::istream_iterator<std::string>());
             break;
           }
+          else if (long_opt_str == "min-block-size")
+          {
+            min_ratio_ = std::max(1ll, std::atoll(optarg ? optarg : ""));
+            break;
+          }
+          else if (long_opt_str == "max-block-size")
+          {
+            min_ratio_ = std::max(1ll, std::atoll(optarg ? optarg : ""));
+            break;
+          }
+          else if (long_opt_str == "slope-unit")
+          {
+            min_ratio_ = std::max(1ll, std::atoll(optarg ? optarg : ""));
+            break;
+          }
           else if (long_opt_str == "haps")
           {
             std::cerr << "Warning: --haps is deprecated\n";


=====================================
src/unique_haplotype.cpp
=====================================
@@ -282,13 +282,19 @@ int unique_haplotype_block::deserialize(savvy::reader& input_file, savvy::varian
 
 void unique_haplotype_block::remove_eov()
 {
-  for (auto it = unique_map_.begin(); it != unique_map_.end(); )
+  std::size_t eov_cnt = 0;
+  auto it = unique_map_.begin();
+  auto dest_it = it;
+  for ( ; it != unique_map_.end(); ++it)
   {
     if (savvy::typed_value::is_end_of_vector(*it))
-      it = unique_map_.erase(it);
+      ++eov_cnt;
     else
-      ++it;
+      *(dest_it++) = *it;
   }
+
+  if (eov_cnt)
+    unique_map_.resize(unique_map_.size() - eov_cnt);
 }
 
 bool unique_haplotype_block::deserialize(std::istream& is, int m3vcf_version, std::size_t n_haplotypes)
@@ -547,4 +553,4 @@ float reduced_haplotypes::compression_ratio() const
   }
 
   return num / denom;
-}
\ No newline at end of file
+}



View it on GitLab: https://salsa.debian.org/med-team/minimac4/-/commit/280b8344a4ae372e06a7c970ed915827ee05a4a3

-- 
View it on GitLab: https://salsa.debian.org/med-team/minimac4/-/commit/280b8344a4ae372e06a7c970ed915827ee05a4a3
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231208/5b5992e6/attachment-0001.htm>


More information about the debian-med-commit mailing list