[med-svn] [Git][med-team/nanopolish][upstream] New upstream version 0.13.1
Michael R. Crusoe
gitlab at salsa.debian.org
Thu Apr 23 12:59:20 BST 2020
Michael R. Crusoe pushed to branch upstream at Debian Med / nanopolish
Commits:
6e9c6932 by Michael R. Crusoe at 2020-04-23T13:51:09+02:00
New upstream version 0.13.1
- - - - -
11 changed files:
- Makefile
- README.md
- src/alignment/nanopolish_alignment_db.cpp
- src/common/nanopolish_common.h
- src/common/nanopolish_variant.cpp
- src/hmm/nanopolish_profile_hmm_r9.cpp
- src/hmm/nanopolish_profile_hmm_r9.inl
- src/io/nanopolish_fast5_io.cpp
- src/nanopolish_call_variants.cpp
- src/nanopolish_index.cpp
- src/nanopolish_squiggle_read.cpp
Changes:
=====================================
Makefile
=====================================
@@ -105,7 +105,7 @@ htslib/libhts.a:
$(MAKE) -C htslib htslib_default_libs="-lz -lm -lbz2" || exit 255
minimap2/libminimap2.a:
- $(MAKE) -C minimap2 $(MINIMAP2_OPT) || exit 255
+ $(MAKE) -C minimap2 $(MINIMAP2_OPT) libminimap2.a || exit 255
#
# If this library is a dependency the user wants HDF5 to be downloaded and built.
=====================================
README.md
=====================================
@@ -6,6 +6,20 @@ Software package for signal-level analysis of Oxford Nanopore sequencing data. N
## Release notes
+* 0.13.1: fix `nanopolish index` performance issue for some barcoding runs
+
+* 0.13.0: modify HMM transitions to allow the balance between insertions and deletions to be changed depending on mode (consensus vs reference variants)
+
+* 0.12.5: make SupportFractionByStrand calculation consistent with SupportFraction
+
+* 0.12.4: add SupportFractionByStrand and SOR to VCF
+
+* 0.12.3: fix hdf5 file handle leak
+
+* 0.12.2: add RefContext info to VCF output of `nanopolish variants`
+
+* 0.12.1: improve how `nanopolish index` handles summary files, add support for selecting reads by BAM read group tags (for `nanopolish variants`)
+
* 0.12.0: live methylation calling, methylation LLR threshold changes as described [here](http://simpsonlab.github.io/2020/03/03/nanopolish-v0.12.0/)
* 0.11.1: `nanopolish polya` now supports SQK-RNA-002 kits with automatic backwards-compatibility with SQK-RNA-001
=====================================
src/alignment/nanopolish_alignment_db.cpp
=====================================
@@ -448,7 +448,10 @@ BamHandles _initialize_bam_itr(const std::string& bam_filename,
// load bam file
handles.bam_fh = sam_open(bam_filename.c_str(), "r");
- assert(handles.bam_fh != NULL);
+ if(handles.bam_fh == NULL) {
+ fprintf(stderr, "Error - could not open bam file %s for read\n", bam_filename.c_str());
+ exit(EXIT_FAILURE);
+ }
// load bam index file
hts_idx_t* bam_idx = sam_index_load(handles.bam_fh, bam_filename.c_str());
=====================================
src/common/nanopolish_common.h
=====================================
@@ -18,7 +18,7 @@
#include "logsum.h"
#define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.12.1"
+#define PACKAGE_VERSION "0.13.1"
#define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
//
=====================================
src/common/nanopolish_variant.cpp
=====================================
@@ -260,6 +260,21 @@ void score_variant_group(VariantGroup& variant_group,
#endif
}
+double calculate_sor(double ref_fwd, double ref_rev, double alt_fwd, double alt_rev)
+{
+ // to avoid zeros
+ ref_fwd += 1;
+ ref_rev += 1;
+ alt_fwd += 1;
+ alt_rev += 1;
+
+ double r = (ref_fwd * alt_rev) / (alt_fwd * ref_rev);
+ double sym_ratio = r + (1.0/r);
+ double ref_ratio = std::min(ref_fwd, ref_rev) / std::max(ref_fwd, ref_rev);
+ double alt_ratio = std::min(alt_fwd, alt_rev) / std::max(alt_fwd, alt_rev);
+ return log(sym_ratio) + log(ref_ratio) - log(alt_ratio);
+}
+
std::vector<Variant> simple_call(VariantGroup& variant_group,
const int ploidy,
const bool genotype_all_input_variants)
@@ -290,7 +305,7 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
// The current combination is represented as a vector of haplotype IDs
std::vector<size_t> current_set = vc_sets.get();
-
+
// Check if the current set consists of entirely of haplotypes without variants
bool is_base_set = true;
for(size_t i = 0; i < current_set.size(); ++i) {
@@ -324,7 +339,7 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
*/
set_score += set_sum;
}
-
+
if(is_base_set) {
base_score = set_score;
base_set = current_set;
@@ -383,7 +398,7 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
}
}
- std::vector<std::vector<int> > allele_strand_support;
+ std::vector<std::vector<double> > allele_strand_support;
for(size_t var_idx = 0; var_idx < variant_group.get_num_variants(); ++var_idx) {
allele_strand_support.push_back({0, 0, 0, 0});
@@ -391,10 +406,9 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
const std::string& read_id = group_reads[ri].first;
size_t strand = variant_group.is_read_rc(read_id);
double pp_alt = get(read_variant_assignment, ri, var_idx);
- size_t alt_support = pp_alt > 0.5;
- size_t idx = 2 * alt_support + strand;
- allele_strand_support[var_idx][idx] += 1;
+ allele_strand_support[var_idx][0 + strand] += (1 - pp_alt);
+ allele_strand_support[var_idx][2 + strand] += pp_alt;
}
}
@@ -424,19 +438,39 @@ std::vector<Variant> simple_call(VariantGroup& variant_group,
v.add_info("AlleleCount", var_count);
v.add_info("SupportFraction", read_variant_support[vi] / group_reads.size());
+ double ref_fwd = allele_strand_support[vi][0];
+ double ref_rev = allele_strand_support[vi][1];
+
+ double alt_fwd = allele_strand_support[vi][2];
+ double alt_rev = allele_strand_support[vi][3];
+
+ std::stringstream sfbs;
+ sfbs << ((double)alt_fwd / (ref_fwd + alt_fwd)) << ","
+ << ((double)alt_rev / (ref_rev + alt_rev));
+ v.add_info("SupportFractionByStrand", sfbs.str());
+
std::stringstream ssss;
- ssss << allele_strand_support[vi][0] << "," << allele_strand_support[vi][1] << ","
- << allele_strand_support[vi][2] << "," << allele_strand_support[vi][3];
+ ssss << (int)std::round(ref_fwd) << "," << (int)std::round(ref_rev) << ","
+ << (int)std::round(alt_fwd) << "," << (int)std::round(alt_rev);
v.add_info("StrandSupport", ssss.str());
double left, right, two;
- kt_fisher_exact(allele_strand_support[vi][0], allele_strand_support[vi][1],
- allele_strand_support[vi][2], allele_strand_support[vi][3],
+ kt_fisher_exact(ref_fwd, ref_rev,
+ alt_fwd, alt_rev,
&left, &right, &two);
int fisher_scaled = (int)(-4.343 * log(two) + .499);
+
+ // clamp when there's underflow
+ if(fisher_scaled < 0) {
+ fisher_scaled = 1000;
+ }
v.add_info("StrandFisherTest", fisher_scaled);
+ // GATK StrandOddsRatio from: https://gatk.broadinstitute.org/hc/en-us/articles/360036732071-StrandOddsRatio
+ double sor = calculate_sor(ref_fwd, ref_rev, alt_fwd, alt_rev);
+ v.add_info("SOR", sor);
+
/*
if(fisher_scaled > 30) {
v.filter = "StrandBias";
=====================================
src/hmm/nanopolish_profile_hmm_r9.cpp
=====================================
@@ -11,6 +11,13 @@
//#define DEBUG_FILL
//#define PRINT_TRAINING_MESSAGES 1
+// nanopolish consensus is designed to correct
+// draft assemblies, which tend to have more deletions
+// than insertions, so the default HMM parameters favor insertions.
+// this doesn't hold for other modes (e.g. reference-based variant calling) so we
+// provide this value to change the deletion/insertion balance
+double hmm_indel_bias_factor = 1.0;
+
void profile_hmm_forward_initialize_r9(FloatMatrix& fm)
{
// initialize forward calculation
=====================================
src/hmm/nanopolish_profile_hmm_r9.inl
=====================================
@@ -12,11 +12,15 @@
#define TRANS_CLIP_SELF 0.9
#define TRANS_START_TO_CLIP 0.5
+extern double hmm_indel_bias_factor;
+
inline std::vector<BlockTransitions> calculate_transitions(uint32_t num_kmers, const HMMInputSequence& sequence, const HMMInputData& data)
{
std::vector<BlockTransitions> transitions(num_kmers);
-
+
double read_events_per_base = data.read->events_per_base[data.strand];
+ read_events_per_base *= hmm_indel_bias_factor;
+ read_events_per_base = std::max(1.25, read_events_per_base);
for(uint32_t ki = 0; ki < num_kmers; ++ki) {
=====================================
src/io/nanopolish_fast5_io.cpp
=====================================
@@ -244,8 +244,9 @@ uint64_t fast5_get_start_time(fast5_file& fh, const std::string& read_id)
#endif
return 0;
}
-
- return fast5_read_uint64_attribute(group, "start_time");
+ uint64_t t = fast5_read_uint64_attribute(group, "start_time");
+ H5Gclose(group);
+ return t;
}
//
=====================================
src/nanopolish_call_variants.cpp
=====================================
@@ -89,6 +89,8 @@ static const char *CONSENSUS_USAGE_MESSAGE =
" -o, --outfile=FILE write result to FILE [default: stdout]\n"
" -t, --threads=NUM use NUM threads (default: 1)\n"
" -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n"
+" -i, --indel-bias=F bias HMM transition parameters to favor insertions (F<1) or deletions (F>1).\n"
+" this value is automatically set depending on --consensus, but can be manually set if spurious indels are called\n"
" -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n"
" -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n"
" --min-flanking-sequence=N distance from alignment end to calculate variants (default: 30)\n"
@@ -136,7 +138,7 @@ namespace opt
static std::vector<std::string> methylation_types;
}
-static const char* shortopts = "r:b:g:t:w:o:e:m:c:d:a:x:q:p:v";
+static const char* shortopts = "r:b:g:t:w:o:e:m:c:d:a:x:q:p:i:v";
enum { OPT_HELP = 1,
OPT_VERSION,
@@ -169,6 +171,7 @@ static const struct option longopts[] = {
{ "outfile", required_argument, NULL, 'o' },
{ "threads", required_argument, NULL, 't' },
{ "min-candidate-frequency", required_argument, NULL, 'm' },
+ { "indel-bias", required_argument, NULL, 'i' },
{ "min-candidate-depth", required_argument, NULL, 'd' },
{ "max-haplotypes", required_argument, NULL, 'x' },
{ "candidates", required_argument, NULL, 'c' },
@@ -208,7 +211,9 @@ std::string get_single_contig_or_fail()
}
const char* name = faidx_iseq(fai, 0);
- return std::string(name);
+ std::string ret(name);
+ fai_destroy(fai);
+ return ret;
}
int get_contig_length(const std::string& contig)
@@ -1005,10 +1010,13 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start,
return called_haplotype;
}
+extern double hmm_indel_bias_factor;
+
void parse_call_variants_options(int argc, char** argv)
{
std::string methylation_motifs_str;
bool die = false;
+ bool bias_override = false;
for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
std::istringstream arg(optarg != NULL ? optarg : "");
switch (c) {
@@ -1019,6 +1027,7 @@ void parse_call_variants_options(int argc, char** argv)
case 'w': arg >> opt::window; break;
case 'o': arg >> opt::output_file; break;
case 'm': arg >> opt::min_candidate_frequency; break;
+ case 'i': arg >> hmm_indel_bias_factor; bias_override = true; break;
case 'd': arg >> opt::min_candidate_depth; break;
case 'x': arg >> opt::max_haplotypes; break;
case 'c': arg >> opt::candidates_file; break;
@@ -1102,6 +1111,11 @@ void parse_call_variants_options(int argc, char** argv)
}
}
+ // set hmm indel bias, if not overridden
+ if(!bias_override) {
+ hmm_indel_bias_factor = opt::consensus_mode ? 0.9 : 0.8;
+ }
+
if (die)
{
std::cout << "\n" << CONSENSUS_USAGE_MESSAGE;
@@ -1171,10 +1185,6 @@ int call_variants_main(int argc, char** argv)
header_fields.push_back(Variant::make_vcf_header_key_value("nanopolish_window", polish_window.str()));
//
- header_fields.push_back(
- Variant::make_vcf_tag_string("FILTER", "StrandBias", 1, "Integer",
- "Variant failed the fisher strand bias test"));
-
header_fields.push_back(
Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer",
"The number of event-space reads used to call the variant"));
@@ -1182,6 +1192,10 @@ int call_variants_main(int argc, char** argv)
header_fields.push_back(
Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float",
"The fraction of event-space reads that support the variant"));
+
+ header_fields.push_back(
+ Variant::make_vcf_tag_string("INFO", "SupportFractionByStrand", 2, "Float",
+ "Fraction of event-space reads that support the variant for each strand"));
header_fields.push_back(
Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer",
@@ -1194,14 +1208,22 @@ int call_variants_main(int argc, char** argv)
header_fields.push_back(
Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer",
"The inferred number of copies of the allele"));
-
+
header_fields.push_back(
Variant::make_vcf_tag_string("INFO", "StrandSupport", 4, "Integer",
"Number of reads supporting the REF and ALT allele, by strand"));
-
+
header_fields.push_back(
Variant::make_vcf_tag_string("INFO", "StrandFisherTest", 1, "Integer",
"Strand bias fisher test"));
+
+ header_fields.push_back(
+ Variant::make_vcf_tag_string("INFO", "SOR", 1, "Float",
+ "StrandOddsRatio test from GATK"));
+
+ header_fields.push_back(
+ Variant::make_vcf_tag_string("INFO", "RefContext", 1, "String",
+ "The reference sequence context surrounding the variant call"));
if(opt::calculate_all_support) {
header_fields.push_back(
@@ -1228,12 +1250,27 @@ int call_variants_main(int argc, char** argv)
}
// write the variants
- for(const auto& v : haplotype.get_variants()) {
+ faidx_t *fai = fai_load(opt::genome_file.c_str());
+ for(auto& v : haplotype.get_variants()) {
if(!opt::snps_only || v.is_snp()) {
+ int context_start_base = v.ref_position - 5;
+ if(context_start_base < 0) {
+ context_start_base = 0;
+ }
+ int context_end_base = v.ref_position + v.ref_seq.length() + 4;
+ if(context_end_base >= contig_length) {
+ context_end_base = contig_length - 1;
+ }
+
+ int len;
+ std::string context = get_reference_region_ts(fai, v.ref_name.c_str(), context_start_base, context_end_base, &len);
+ v.add_info("RefContext", context);
v.write_vcf(out_fp);
}
}
+ fai_destroy(fai);
+ fai = NULL;
//
if(out_fp != stdout) {
=====================================
src/nanopolish_index.cpp
=====================================
@@ -306,7 +306,7 @@ void clean_fast5_map(std::multimap<std::string, std::string>& mmap)
std::vector<std::string> invalid_entries;
for(const auto& iter : fast5_read_count) {
- if(iter.second != EXPECTED_ENTRIES_PER_FAST5) {
+ if(iter.second > EXPECTED_ENTRIES_PER_FAST5) {
//fprintf(stderr, "warning: %s has %d entries in the summary and will be indexed the slow way\n", iter.first.c_str(), iter.second);
invalid_entries.push_back(iter.first);
}
=====================================
src/nanopolish_squiggle_read.cpp
=====================================
@@ -110,15 +110,17 @@ void SquiggleRead::init(const std::string& read_sequence, const Fast5Data& data,
this->events_per_base[0] = events_per_base[1] = 0.0f;
this->base_model[0] = this->base_model[1] = NULL;
g_total_reads += 1;
- assert(data.is_valid);
- assert(data.rt.n > 0);
this->read_name = data.read_name;
this->read_sequence = read_sequence;
// sometimes the basecaller will emit very short sequences, which causes problems
- if(this->read_sequence.length() > 20) {
+ // also there can be rare issues with the signal in the fast5 and we want to skip
+ // such reads
+ if(this->read_sequence.length() > 20 && data.is_valid && data.rt.n > 0) {
load_from_raw(data, flags);
+ } else {
+ g_bad_fast5_file += 1;
}
if(!this->events[0].empty()) {
View it on GitLab: https://salsa.debian.org/med-team/nanopolish/-/commit/6e9c69326f22176c09b9e1e33fc8f2428fdd0d78
--
View it on GitLab: https://salsa.debian.org/med-team/nanopolish/-/commit/6e9c69326f22176c09b9e1e33fc8f2428fdd0d78
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200423/70467385/attachment-0001.html>
More information about the debian-med-commit
mailing list