[med-svn] [Git][med-team/nanopolish][upstream] New upstream version 0.13.1

Michael R. Crusoe gitlab at salsa.debian.org
Thu Apr 23 12:59:20 BST 2020



Michael R. Crusoe pushed to branch upstream at Debian Med / nanopolish


Commits:
6e9c6932 by Michael R. Crusoe at 2020-04-23T13:51:09+02:00
New upstream version 0.13.1
- - - - -


11 changed files:

- Makefile
- README.md
- src/alignment/nanopolish_alignment_db.cpp
- src/common/nanopolish_common.h
- src/common/nanopolish_variant.cpp
- src/hmm/nanopolish_profile_hmm_r9.cpp
- src/hmm/nanopolish_profile_hmm_r9.inl
- src/io/nanopolish_fast5_io.cpp
- src/nanopolish_call_variants.cpp
- src/nanopolish_index.cpp
- src/nanopolish_squiggle_read.cpp


Changes:

=====================================
Makefile
=====================================
@@ -105,7 +105,7 @@ htslib/libhts.a:
 	$(MAKE) -C htslib htslib_default_libs="-lz -lm -lbz2" || exit 255
 
 minimap2/libminimap2.a:
-	$(MAKE) -C minimap2 $(MINIMAP2_OPT) || exit 255
+	$(MAKE) -C minimap2 $(MINIMAP2_OPT) libminimap2.a || exit 255
 
 #
 # If this library is a dependency the user wants HDF5 to be downloaded and built.


=====================================
README.md
=====================================
@@ -6,6 +6,20 @@ Software package for signal-level analysis of Oxford Nanopore sequencing data. N
 
 ## Release notes
 
+* 0.13.1: fix `nanopolish index` performance issue for some barcoding runs
+
+* 0.13.0: modify HMM transitions to allow the balance between insertions and deletions to be changed depending on mode (consensus vs reference variants)
+
+* 0.12.5: make SupportFractionByStrand calculation consistent with SupportFraction
+
+* 0.12.4: add SupportFractionByStrand and SOR to VCF
+
+* 0.12.3: fix hdf5 file handle leak
+
+* 0.12.2: add RefContext info to VCF output of `nanopolish variants`
+
+* 0.12.1: improve how `nanopolish index` handles summary files, add support for selecting reads by BAM read group tags (for `nanopolish variants`)
+
 * 0.12.0: live methylation calling, methylation LLR threshold changes as described [here](http://simpsonlab.github.io/2020/03/03/nanopolish-v0.12.0/)
 
 * 0.11.1: `nanopolish polya` now supports SQK-RNA-002 kits with automatic backwards-compatibility with SQK-RNA-001


=====================================
src/alignment/nanopolish_alignment_db.cpp
=====================================
@@ -448,7 +448,10 @@ BamHandles _initialize_bam_itr(const std::string& bam_filename,
 
     // load bam file
     handles.bam_fh = sam_open(bam_filename.c_str(), "r");
-    assert(handles.bam_fh != NULL);
+    if(handles.bam_fh == NULL) {
+        fprintf(stderr, "Error - could not open bam file %s for read\n", bam_filename.c_str());
+        exit(EXIT_FAILURE);
+    }
 
     // load bam index file
     hts_idx_t* bam_idx = sam_index_load(handles.bam_fh, bam_filename.c_str());
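
The hunk above replaces a bare assert with a readable error message and a clean exit when the BAM cannot be opened. As a sketch only (not part of this commit; the helper name open_bam_or_die is hypothetical and htslib headers are assumed to be on the include path), the same fail-fast pattern could be wrapped into a small helper that also covers the index load shown on the last context line:

    #include <cstdio>
    #include <cstdlib>
    #include <string>
    #include <utility>
    #include <htslib/sam.h>

    // sketch: open a BAM and its index, aborting with a readable message on failure
    static std::pair<samFile*, hts_idx_t*> open_bam_or_die(const std::string& bam_filename)
    {
        samFile* fh = sam_open(bam_filename.c_str(), "r");
        if(fh == NULL) {
            fprintf(stderr, "Error - could not open bam file %s for read\n", bam_filename.c_str());
            exit(EXIT_FAILURE);
        }
        hts_idx_t* idx = sam_index_load(fh, bam_filename.c_str());
        if(idx == NULL) {
            fprintf(stderr, "Error - could not load the index for bam file %s\n", bam_filename.c_str());
            exit(EXIT_FAILURE);
        }
        return {fh, idx};
    }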


=====================================
src/common/nanopolish_common.h
=====================================
@@ -18,7 +18,7 @@
 #include "logsum.h"
 
 #define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.12.1"
+#define PACKAGE_VERSION "0.13.1"
 #define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
 
 //


=====================================
src/common/nanopolish_variant.cpp
=====================================
@@ -260,6 +260,21 @@ void score_variant_group(VariantGroup& variant_group,
 #endif
 }
 
+double calculate_sor(double ref_fwd, double ref_rev, double alt_fwd, double alt_rev)
+{
+    // to avoid zeros
+    ref_fwd += 1;
+    ref_rev += 1;
+    alt_fwd += 1;
+    alt_rev += 1;
+
+    double r = (ref_fwd * alt_rev) / (alt_fwd * ref_rev);
+    double sym_ratio = r + (1.0/r);
+    double ref_ratio = std::min(ref_fwd, ref_rev) / std::max(ref_fwd, ref_rev);
+    double alt_ratio = std::min(alt_fwd, alt_rev) / std::max(alt_fwd, alt_rev);
+    return log(sym_ratio) + log(ref_ratio) - log(alt_ratio);
+}
+
 std::vector<Variant> simple_call(VariantGroup& variant_group,
                                  const int ploidy,
                                  const bool genotype_all_input_variants)
@@ -290,7 +305,7 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
 
         // The current combination is represented as a vector of haplotype IDs
         std::vector<size_t> current_set = vc_sets.get();
-        
+
         // Check if the current set consists of entirely of haplotypes without variants
         bool is_base_set = true;
         for(size_t i = 0; i < current_set.size(); ++i) {
@@ -324,7 +339,7 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
             */
             set_score += set_sum;
         }
-        
+
         if(is_base_set) {
             base_score = set_score;
             base_set = current_set;
@@ -383,7 +398,7 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
         }
     }
 
-    std::vector<std::vector<int> > allele_strand_support;
+    std::vector<std::vector<double> > allele_strand_support;
     for(size_t var_idx = 0; var_idx < variant_group.get_num_variants(); ++var_idx) {
         allele_strand_support.push_back({0, 0, 0, 0});
 
@@ -391,10 +406,9 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
             const std::string& read_id = group_reads[ri].first;
             size_t strand = variant_group.is_read_rc(read_id);
             double pp_alt = get(read_variant_assignment, ri, var_idx);
-            size_t alt_support = pp_alt > 0.5;
 
-            size_t idx = 2 * alt_support + strand;
-            allele_strand_support[var_idx][idx] += 1;
+            allele_strand_support[var_idx][0 + strand] += (1 - pp_alt);
+            allele_strand_support[var_idx][2 + strand] += pp_alt;
         }
     }
 
@@ -424,19 +438,39 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
         v.add_info("AlleleCount", var_count);
         v.add_info("SupportFraction", read_variant_support[vi] / group_reads.size());
 
+        double ref_fwd = allele_strand_support[vi][0];
+        double ref_rev = allele_strand_support[vi][1];
+
+        double alt_fwd = allele_strand_support[vi][2];
+        double alt_rev = allele_strand_support[vi][3];
+
+        std::stringstream sfbs;
+        sfbs << ((double)alt_fwd / (ref_fwd + alt_fwd)) << ","
+             << ((double)alt_rev / (ref_rev + alt_rev));
+        v.add_info("SupportFractionByStrand", sfbs.str());
+
         std::stringstream ssss;
-        ssss << allele_strand_support[vi][0] << "," << allele_strand_support[vi][1] << ","
-             << allele_strand_support[vi][2] << "," << allele_strand_support[vi][3];
+        ssss << (int)std::round(ref_fwd) << "," << (int)std::round(ref_rev) << ","
+             << (int)std::round(alt_fwd) << "," << (int)std::round(alt_rev);
         v.add_info("StrandSupport", ssss.str());
 
         double left, right, two;
-        kt_fisher_exact(allele_strand_support[vi][0], allele_strand_support[vi][1],
-                        allele_strand_support[vi][2], allele_strand_support[vi][3],
+        kt_fisher_exact(ref_fwd, ref_rev,
+                        alt_fwd, alt_rev,
                         &left, &right, &two);
 
         int fisher_scaled = (int)(-4.343 * log(two) + .499);
+
+        // clamp when there's underflow
+        if(fisher_scaled < 0) {
+            fisher_scaled = 1000;
+        }
         v.add_info("StrandFisherTest", fisher_scaled);
 
+        // GATK StrandOddsRatio from: https://gatk.broadinstitute.org/hc/en-us/articles/360036732071-StrandOddsRatio
+        double sor = calculate_sor(ref_fwd, ref_rev, alt_fwd, alt_rev);
+        v.add_info("SOR", sor);
+
         /*
         if(fisher_scaled > 30) {
             v.filter = "StrandBias";
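
To make the new SOR annotation concrete, here is a standalone sketch (toy counts, not real data) that exercises calculate_sor() exactly as defined in the hunk above: strand-balanced ALT support gives a small SOR (about 0.69), while ALT support concentrated on one strand gives a noticeably larger value (about 3.44).

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // copy of the patch's calculate_sor(), following the GATK StrandOddsRatio definition
    static double calculate_sor(double ref_fwd, double ref_rev, double alt_fwd, double alt_rev)
    {
        // pseudocount to avoid zeros
        ref_fwd += 1; ref_rev += 1; alt_fwd += 1; alt_rev += 1;
        double r = (ref_fwd * alt_rev) / (alt_fwd * ref_rev);
        double sym_ratio = r + (1.0 / r);
        double ref_ratio = std::min(ref_fwd, ref_rev) / std::max(ref_fwd, ref_rev);
        double alt_ratio = std::min(alt_fwd, alt_rev) / std::max(alt_fwd, alt_rev);
        return log(sym_ratio) + log(ref_ratio) - log(alt_ratio);
    }

    int main()
    {
        printf("balanced ALT support: %.2f\n", calculate_sor(20, 20, 10, 10));  // ~0.69
        printf("one-strand ALT support: %.2f\n", calculate_sor(20, 20, 10, 1)); // ~3.44
        return 0;
    }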


=====================================
src/hmm/nanopolish_profile_hmm_r9.cpp
=====================================
@@ -11,6 +11,13 @@
 //#define DEBUG_FILL
 //#define PRINT_TRAINING_MESSAGES 1
 
+// nanopolish consensus is designed to correct
+// draft assemblies, which tend to have more deletions
+// than insertions, so the default HMM parameters favor insertions.
+// this doesn't hold for other modes (e.g. reference-based variant calling) so we
+// provide this value to change the deletion/insertion balance
+double hmm_indel_bias_factor = 1.0;
+
 void profile_hmm_forward_initialize_r9(FloatMatrix& fm)
 {
     // initialize forward calculation


=====================================
src/hmm/nanopolish_profile_hmm_r9.inl
=====================================
@@ -12,11 +12,15 @@
 #define TRANS_CLIP_SELF 0.9
 #define TRANS_START_TO_CLIP 0.5
 
+extern double hmm_indel_bias_factor;
+
 inline std::vector<BlockTransitions> calculate_transitions(uint32_t num_kmers, const HMMInputSequence& sequence, const HMMInputData& data)
 {
     std::vector<BlockTransitions> transitions(num_kmers);
-    
+
     double read_events_per_base = data.read->events_per_base[data.strand];
+    read_events_per_base *= hmm_indel_bias_factor;
+    read_events_per_base = std::max(1.25, read_events_per_base);
 
     for(uint32_t ki = 0; ki < num_kmers; ++ki) {
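
A small worked example (hypothetical event rates) of what this scaling does before the block transitions are computed: with the reference-calling default of hmm_indel_bias_factor = 0.8 (set in nanopolish_call_variants.cpp further down), a read observed at 1.6 events per base is rescaled to 1.28, while a read at 1.5 would drop to 1.2 and be clamped back up to the 1.25 floor.

    #include <algorithm>
    #include <cstdio>
    #include <initializer_list>

    int main()
    {
        const double hmm_indel_bias_factor = 0.8; // patch default for reference-based calling
        for(double events_per_base : {1.6, 1.5}) {
            double scaled = std::max(1.25, events_per_base * hmm_indel_bias_factor);
            printf("%.2f events/base -> %.2f\n", events_per_base, scaled); // 1.28 and 1.25
        }
        return 0;
    }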
 


=====================================
src/io/nanopolish_fast5_io.cpp
=====================================
@@ -244,8 +244,9 @@ uint64_t fast5_get_start_time(fast5_file& fh, const std::string& read_id)
 #endif
         return 0;
     }
-
-    return fast5_read_uint64_attribute(group, "start_time");
+    uint64_t t = fast5_read_uint64_attribute(group, "start_time");
+    H5Gclose(group);
+    return t;
 }
 
 //


=====================================
src/nanopolish_call_variants.cpp
=====================================
@@ -89,6 +89,8 @@ static const char *CONSENSUS_USAGE_MESSAGE =
 "  -o, --outfile=FILE                   write result to FILE [default: stdout]\n"
 "  -t, --threads=NUM                    use NUM threads (default: 1)\n"
 "  -m, --min-candidate-frequency=F      extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n"
+"  -i, --indel-bias=F                   bias HMM transition parameters to favor insertions (F<1) or deletions (F>1).\n"
+"                                       this value is automatically set depending on --consensus, but can be manually set if spurious indels are called\n"
 "  -d, --min-candidate-depth=D          extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n"
 "  -x, --max-haplotypes=N               consider at most N haplotype combinations (default: 1000)\n"
 "      --min-flanking-sequence=N        distance from alignment end to calculate variants (default: 30)\n"
@@ -136,7 +138,7 @@ namespace opt
     static std::vector<std::string> methylation_types;
 }
 
-static const char* shortopts = "r:b:g:t:w:o:e:m:c:d:a:x:q:p:v";
+static const char* shortopts = "r:b:g:t:w:o:e:m:c:d:a:x:q:p:i:v";
 
 enum { OPT_HELP = 1,
        OPT_VERSION,
@@ -169,6 +171,7 @@ static const struct option longopts[] = {
     { "outfile",                   required_argument, NULL, 'o' },
     { "threads",                   required_argument, NULL, 't' },
     { "min-candidate-frequency",   required_argument, NULL, 'm' },
+    { "indel-bias",                required_argument, NULL, 'i' },
     { "min-candidate-depth",       required_argument, NULL, 'd' },
     { "max-haplotypes",            required_argument, NULL, 'x' },
     { "candidates",                required_argument, NULL, 'c' },
@@ -208,7 +211,9 @@ std::string get_single_contig_or_fail()
     }
 
     const char* name = faidx_iseq(fai, 0);
-    return std::string(name);
+    std::string ret(name);
+    fai_destroy(fai);
+    return ret;
 }
 
 int get_contig_length(const std::string& contig)
@@ -1005,10 +1010,13 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start,
     return called_haplotype;
 }
 
+extern double hmm_indel_bias_factor;
+
 void parse_call_variants_options(int argc, char** argv)
 {
     std::string methylation_motifs_str;
     bool die = false;
+    bool bias_override = false;
     for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
         std::istringstream arg(optarg != NULL ? optarg : "");
         switch (c) {
@@ -1019,6 +1027,7 @@ void parse_call_variants_options(int argc, char** argv)
             case 'w': arg >> opt::window; break;
             case 'o': arg >> opt::output_file; break;
             case 'm': arg >> opt::min_candidate_frequency; break;
+            case 'i': arg >> hmm_indel_bias_factor; bias_override = true; break;
             case 'd': arg >> opt::min_candidate_depth; break;
             case 'x': arg >> opt::max_haplotypes; break;
             case 'c': arg >> opt::candidates_file; break;
@@ -1102,6 +1111,11 @@ void parse_call_variants_options(int argc, char** argv)
         }
     }
 
+    // set hmm indel bias, if not overridden
+    if(!bias_override) {
+        hmm_indel_bias_factor = opt::consensus_mode ? 0.9 : 0.8;    
+    }
+
     if (die)
     {
         std::cout << "\n" << CONSENSUS_USAGE_MESSAGE;
@@ -1171,10 +1185,6 @@ int call_variants_main(int argc, char** argv)
     header_fields.push_back(Variant::make_vcf_header_key_value("nanopolish_window", polish_window.str()));
 
     //
-    header_fields.push_back(
-            Variant::make_vcf_tag_string("FILTER", "StrandBias", 1, "Integer",
-                "Variant failed the fisher strand bias test"));
-
     header_fields.push_back(
         Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer",
                                       "The number of event-space reads used to call the variant"));
@@ -1182,6 +1192,10 @@ int call_variants_main(int argc, char** argv)
     header_fields.push_back(
         Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float",
                                       "The fraction of event-space reads that support the variant"));
+    
+    header_fields.push_back(
+            Variant::make_vcf_tag_string("INFO", "SupportFractionByStrand", 2, "Float",
+                "Fraction of event-space reads that support the variant for each strand"));
 
     header_fields.push_back(
         Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer",
@@ -1194,14 +1208,22 @@ int call_variants_main(int argc, char** argv)
     header_fields.push_back(
             Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer",
                 "The inferred number of copies of the allele"));
-    
+
     header_fields.push_back(
             Variant::make_vcf_tag_string("INFO", "StrandSupport", 4, "Integer",
                 "Number of reads supporting the REF and ALT allele, by strand"));
-    
+
     header_fields.push_back(
             Variant::make_vcf_tag_string("INFO", "StrandFisherTest", 1, "Integer",
                 "Strand bias fisher test"));
+    
+    header_fields.push_back(
+            Variant::make_vcf_tag_string("INFO", "SOR", 1, "Float",
+                "StrandOddsRatio test from GATK"));
+
+    header_fields.push_back(
+            Variant::make_vcf_tag_string("INFO", "RefContext", 1, "String",
+                "The reference sequence context surrounding the variant call"));
 
     if(opt::calculate_all_support) {
         header_fields.push_back(
@@ -1228,12 +1250,27 @@ int call_variants_main(int argc, char** argv)
     }
 
     // write the variants
-    for(const auto& v : haplotype.get_variants()) {
+    faidx_t *fai = fai_load(opt::genome_file.c_str());
+    for(auto& v : haplotype.get_variants()) {
 
         if(!opt::snps_only || v.is_snp()) {
+            int context_start_base = v.ref_position - 5;
+            if(context_start_base < 0) {
+                context_start_base = 0;
+            }
+            int context_end_base = v.ref_position + v.ref_seq.length() + 4;
+            if(context_end_base >= contig_length) {
+                context_end_base = contig_length - 1;
+            }
+  
+            int len;
+            std::string context = get_reference_region_ts(fai, v.ref_name.c_str(), context_start_base, context_end_base, &len);
+            v.add_info("RefContext", context);
             v.write_vcf(out_fp);
         }
     }
+    fai_destroy(fai);
+    fai = NULL;
 
     //
     if(out_fp != stdout) {
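
For reference, a self-contained sketch (hypothetical coordinates) of the window arithmetic behind the new RefContext annotation above: up to five bases of flanking sequence on each side of the REF allele, clipped to the contig bounds before the sequence is fetched from the faidx index.

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        int contig_length = 1000; // hypothetical contig length
        int ref_position  = 3;    // hypothetical variant near the contig start
        int ref_len       = 1;    // length of the REF allele

        int context_start_base = std::max(0, ref_position - 5);
        int context_end_base   = std::min(contig_length - 1, ref_position + ref_len + 4);
        printf("RefContext spans reference bases [%d, %d]\n", context_start_base, context_end_base); // [0, 8]
        return 0;
    }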


=====================================
src/nanopolish_index.cpp
=====================================
@@ -306,7 +306,7 @@ void clean_fast5_map(std::multimap<std::string, std::string>& mmap)
 
     std::vector<std::string> invalid_entries;
     for(const auto& iter : fast5_read_count) {
-        if(iter.second != EXPECTED_ENTRIES_PER_FAST5) {
+        if(iter.second > EXPECTED_ENTRIES_PER_FAST5) {
             //fprintf(stderr, "warning: %s has %d entries in the summary and will be indexed the slow way\n", iter.first.c_str(), iter.second);
             invalid_entries.push_back(iter.first);
         }


=====================================
src/nanopolish_squiggle_read.cpp
=====================================
@@ -110,15 +110,17 @@ void SquiggleRead::init(const std::string& read_sequence, const Fast5Data& data,
     this->events_per_base[0] = events_per_base[1] = 0.0f;
     this->base_model[0] = this->base_model[1] = NULL;
     g_total_reads += 1;
-    assert(data.is_valid);
-    assert(data.rt.n > 0);
 
     this->read_name = data.read_name;
     this->read_sequence = read_sequence;
 
     // sometimes the basecaller will emit very short sequences, which causes problems
-    if(this->read_sequence.length() > 20) {
+    // also there can be rare issues with the signal in the fast5 and we want to skip
+    // such reads
+    if(this->read_sequence.length() > 20 && data.is_valid && data.rt.n > 0) {
         load_from_raw(data, flags);
+    } else {
+        g_bad_fast5_file += 1;
     }
 
     if(!this->events[0].empty()) {



View it on GitLab: https://salsa.debian.org/med-team/nanopolish/-/commit/6e9c69326f22176c09b9e1e33fc8f2428fdd0d78
