[med-svn] [Git][med-team/nanopolish][upstream] New upstream version 0.11.1

Andreas Tille gitlab at salsa.debian.org
Wed Jul 10 12:24:30 BST 2019



Andreas Tille pushed to branch upstream at Debian Med / nanopolish


Commits:
7868af61 by Andreas Tille at 2019-07-10T11:12:03Z
New upstream version 0.11.1
- - - - -


10 changed files:

- .gitignore
- .travis.yml
- + Dockerfile-arm
- Makefile
- docs/source/index.rst
- docs/source/manual.rst
- scripts/calculate_methylation_frequency.py
- src/common/nanopolish_common.h
- src/nanopolish_index.cpp
- src/nanopolish_polya_estimator.cpp


Changes:

=====================================
.gitignore
=====================================
@@ -17,7 +17,7 @@ include/
 lib/
 share/
 
-hdf5-1.8.14.tar.gz
+hdf5-1.*.tar.gz
 3.2.5.tar.bz2
 eigen/
 local*


=====================================
.travis.yml
=====================================
@@ -1,27 +1,83 @@
 # travis.yml for github.com/jts/nanopolish
 
 dist: trusty
+services: docker
 sudo: false
 language: generic
 cache: apt
 git:
     depth: 1
 
+.nanopolish.ci.matrix-definitions:
+    - &cron-only
+      if: type = cron OR env(CI_CRON) = true
+    - &arch
+      before_install:
+          - |
+            sed -i "/^FROM / s/arm64/${ARCH}/" Dockerfile-arm
+          - |
+            docker run --rm --privileged \
+                multiarch/qemu-user-static:register --reset && \
+                docker build --rm -t nanopolish -f Dockerfile-arm .
+      script:
+          - |
+            docker run --rm -t \
+                -e HDF5="${HDF5:-install}" \
+                -e H5_CFLAGS="${H5_CFLAGS}" \
+                -e H5_INCLUDE="${H5_INCLUDE}" \
+                -e LDFLAGS="${LDFLAGS}" \
+                nanopolish
+
 matrix:
     include:
-    # Set env for both nanoplish and the dependency hdf5.
-    - env:
-        - CC=gcc-4.8
-        - CXX=g++-4.8
-        - AR=gcc-ar-4.8
-        - NM=gcc-nm-4.8
-        - RANLIB=gcc-ranlib-4.8
-    - env:
-        - CC=gcc-8
-        - CXX=g++-8
-        - AR=gcc-ar-8
-        - NM=gcc-nm-8
-        - RANLIB=gcc-ranlib-8
+        # Set env for both nanopolish and the dependency hdf5.
+        - env:
+              - CC=gcc-4.8
+              - CXX=g++-4.8
+              - AR=gcc-ar-4.8
+              - NM=gcc-nm-4.8
+              - RANLIB=gcc-ranlib-4.8
+        - env:
+              - CC=gcc-8
+              - CXX=g++-8
+              - AR=gcc-ar-8
+              - NM=gcc-nm-8
+              - RANLIB=gcc-ranlib-8
+        # aarch64 - ARM 64-bit
+        - name: aarch64
+          sudo: required
+          env:
+              - ARCH=arm64
+          <<: *arch
+          <<: *cron-only
+        - name: aarch64-system-hdf5
+          sudo: required
+          env:
+              - ARCH=arm64
+              - HDF5="system"
+              - H5_INCLUDE="-I/usr/include/hdf5/serial"
+              - LDFLAGS="-L/usr/lib/aarch64-linux-gnu/hdf5/serial"
+          <<: *arch
+        # armv7l - ARM 32-bit
+        - name: armv7l
+          sudo: required
+          env:
+              - ARCH=armhf
+          <<: *arch
+          <<: *cron-only
+        - name: armv7l-system-hdf5
+          sudo: required
+          env:
+              - ARCH=armhf
+              - HDF5="system"
+              - H5_INCLUDE="-I/usr/include/hdf5/serial"
+              - LDFLAGS="-L/usr/lib/arm-linux-gnueabihf/hdf5/serial"
+          <<: *arch
+    allow_failures:
+        # The jobs installing hdf5 from source in docker finish with an error
+        # because the job exceeded the maximum time limit (50 minutes).
+        - name: aarch64
+        - name: armv7l
 
 # Install and export newer gcc
 before_install:
@@ -38,9 +94,11 @@ before_install:
           sudo apt-get install -qq "${CXX}"
       fi
 
-script:
+before_script:
     # Suppress all compiler warnings for hdf5 Makefile
     # to display the log without downloading the raw log on Travis log page.
     # Travis finishes with an error when exceeding the limit of 4 MB of log length.
     - export H5_CFLAGS="-w"
+
+script:
     - make && make test


=====================================
Dockerfile-arm
=====================================
@@ -0,0 +1,18 @@
+FROM multiarch/ubuntu-debootstrap:arm64-bionic
+RUN uname -a
+RUN apt-get update -qq && \
+  apt-get install -yq --no-install-suggests --no-install-recommends \
+  bzip2 \
+  ca-certificates \
+  gcc \
+  g++ \
+  make \
+  software-properties-common
+RUN add-apt-repository -y universe && \
+  apt-get update -qq && \
+  apt-get install -yq libhdf5-dev
+RUN find /usr/include -name "hdf5.h" || true
+RUN find /usr/lib -name "libhdf5.a" || true
+WORKDIR /nanopolish
+COPY . .
+CMD exec bash -c "make && make test"


=====================================
Makefile
=====================================
@@ -12,6 +12,7 @@ LIBS = -lz
 CXXFLAGS ?= -g -O3
 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -D_FILE_OFFSET_BITS=64 #D_FILE_OFFSET_BITS=64 makes nanopolish work in 32 bit systems
 CFLAGS ?= -O3 -std=c99 -fsigned-char -D_FILE_OFFSET_BITS=64 
+LDFLAGS ?=
 CXX ?= g++
 CC ?= gcc
 
@@ -20,8 +21,7 @@ HDF5 ?= install
 EIGEN ?= install
 HTS ?= install
 
-HDF5_VERSION ?= 1.8.14
-HDF5_CONFIG_ARGS ?= --enable-threadsafe
+HDF5_VERSION ?= 1.10.4
 EIGEN_VERSION ?= 3.2.5
 
 # Check operating system, OSX doesn't have -lrt
@@ -38,7 +38,7 @@ ifeq ($(HDF5), install)
 else
     # Use system-wide hdf5
     H5_LIB =
-    H5_INCLUDE =
+    H5_INCLUDE ?=
     LIBS += -lhdf5
 endif
 
@@ -91,13 +91,13 @@ htslib/libhts.a:
 #
 lib/libhdf5.a:
 	if [ ! -e hdf5-$(HDF5_VERSION).tar.gz ]; then \
-		version_major_minior=`echo "$(HDF5_VERSION)" | sed -E 's/\.[0-9]+$$//'`; \
-		wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-$${version_major_minior}/hdf5-$(HDF5_VERSION)/src/hdf5-$(HDF5_VERSION).tar.gz; \
+		version_major_minor=`echo "$(HDF5_VERSION)" | sed -E 's/\.[0-9]+$$//'`; \
+		wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-$${version_major_minor}/hdf5-$(HDF5_VERSION)/src/hdf5-$(HDF5_VERSION).tar.gz; \
 	fi
 
 	tar -xzf hdf5-$(HDF5_VERSION).tar.gz || exit 255
 	cd hdf5-$(HDF5_VERSION) && \
-        ./configure --enable-threadsafe --libdir=`pwd`/../lib --includedir=`pwd`/../include --prefix=`pwd`/.. && \
+		./configure --enable-threadsafe --disable-hl --libdir=`pwd`/../lib --includedir=`pwd`/../include --prefix=`pwd`/.. && \
 		make && make install
 
 # Download and install eigen if not already downloaded


=====================================
docs/source/index.rst
=====================================
@@ -15,6 +15,7 @@ nanopolish
    quickstart_consensus
    quickstart_eventalign
    quickstart_call_methylation
+   quickstart_polya
    debug
    manual
 


=====================================
docs/source/manual.rst
=====================================
@@ -11,6 +11,7 @@ Modules available: ::
     nanopolish variants --consensus: calculate an improved consensus sequence for a draft genome assembly
     nanopolish eventalign: align signal-level events to k-mers of a reference genome
     nanopolish phase-reads: Phase reads using heterozygous SNVs with respect to a reference genome 
+    nanopolish polya: Estimate polyadenylated tail lengths on native RNA reads
 |
 
 extract
@@ -486,3 +487,63 @@ Usage example
      - print out a progress message
 
  
+polya
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+Estimate the number of nucleotides in the poly(A) tails of native RNA reads.
+
+Input
+"""""""""""""""""""""""
+
+    * basecalled reads
+    * alignment information
+    * reference transcripts
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish polya [OPTIONS] --reads=reads.fa --bam=alignments.bam --genome=ref.fa
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``-w, --window=STR``
+     - N
+     - NA
+     - Compute only for reads aligning to window of reference STR (format : ctg:start_id-end_id)
+
+   * - ``-r, --reads=FILE``
+     - Y
+     - NA
+     - the FAST(A/Q) file of native RNA reads
+
+   * - ``-b, --bam=FILE``
+     - Y
+     - NA
+     - the BAM file of alignments between reads and the reference
+
+   * - ``-g, --genome=FILE``
+     - Y
+     - NA
+     - the reference transcripts
+
+   * - ``-t, --threads=NUM``
+     - N
+     - 1
+     - use NUM threads
+
+   * - ``-v, -vv``
+     - N
+     - NA
+     - `-v` returns raw sample log-likelihoods, while `-vv` returns event durations


=====================================
scripts/calculate_methylation_frequency.py
=====================================
@@ -6,17 +6,9 @@ import csv
 import argparse
 from collections import namedtuple
 
-def make_key(c, s, e):
-    return c + ":" + str(s) + ":" + str(e)
-
-def split_key(k):
-    f = k.split(":")
-    return (f[0], int(f[1]), int(f[2]))
-
 class SiteStats:
     def __init__(self, g_size, g_seq):
         self.num_reads = 0
-        self.posterior_methylated = 0
         self.called_sites = 0
         self.called_sites_methylated = 0
         self.group_size = g_size
@@ -60,7 +52,7 @@ for record in csv_reader:
     
     # if this is a multi-cpg group and split_groups is set, break up these sites
     if args.split_groups and num_sites > 1:
-        c = record['chromosome'] 
+        c = str(record['chromosome']) 
         s = int(record['start'])
         e = int(record['end'])
 
@@ -69,20 +61,20 @@ for record in csv_reader:
         cg_pos = sequence.find("CG")
         first_cg_pos = cg_pos
         while cg_pos != -1:
-            key = make_key(c, s + cg_pos - first_cg_pos, s + cg_pos - first_cg_pos)
+            key = (c, s + cg_pos - first_cg_pos, s + cg_pos - first_cg_pos)
             update_call_stats(key, 1, is_methylated, "split-group")
             cg_pos = sequence.find("CG", cg_pos + 1)
     else:
-        key = make_key(record['chromosome'], record['start'], record['end'])
+        key = (str(record['chromosome']), int(record['start']), int(record['end']))
         update_call_stats(key, num_sites, is_methylated, sequence)
 
 # header
 print("\t".join(["chromosome", "start", "end", "num_motifs_in_group", "called_sites", "called_sites_methylated", "methylated_frequency", "group_sequence"]))
 
-sorted_keys = sorted(sites.keys(), key = lambda x: split_key(x))
+sorted_keys = sorted(sites.keys(), key = lambda x: x)
 
 for key in sorted_keys:
     if sites[key].called_sites > 0:
-        (c, s, e) = key.split(":")
+        (c, s, e) = key
         f = float(sites[key].called_sites_methylated) / sites[key].called_sites
         print("%s\t%s\t%s\t%d\t%d\t%d\t%.3f\t%s" % (c, s, e, sites[key].group_size, sites[key].called_sites, sites[key].called_sites_methylated, f, sites[key].sequence))


=====================================
src/common/nanopolish_common.h
=====================================
@@ -18,7 +18,7 @@
 #include "logsum.h"
 
 #define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.11.0"
+#define PACKAGE_VERSION "0.11.1"
 #define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
 
 //


=====================================
src/nanopolish_index.cpp
=====================================
@@ -56,7 +56,7 @@ namespace opt
 static std::ostream* os_p;
 
 //
-void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::map<std::string, std::string>& fast5_to_read_name_map)
+void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::multimap<std::string, std::string>& fast5_to_read_name_map)
 {
     PROFILE_FUNC("index_file_from_map")
 
@@ -64,20 +64,16 @@ void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::map<
     size_t last_dir_pos = fn.find_last_of("/");
     std::string fast5_basename = last_dir_pos == std::string::npos ? fn : fn.substr(last_dir_pos + 1);
 
-    const auto& iter = fast5_to_read_name_map.find(fast5_basename);
-    if(iter != fast5_to_read_name_map.end()) {
+    auto range = fast5_to_read_name_map.equal_range(fast5_basename);
+    for(auto iter = range.first; iter != range.second; ++iter) {
         if(read_db.has_read(iter->second)) {
             read_db.add_signal_path(iter->second.c_str(), fn);
         }
-    } else {
-        if(opt::verbose > 0) {
-            fprintf(stderr, "Could not find read %s in sequencing summary file\n", fn.c_str());
-        }
     }
 }
 
 //
-void index_file_from_fast5(ReadDB& read_db, const std::string& fn, const std::map<std::string, std::string>& fast5_to_read_name_map)
+void index_file_from_fast5(ReadDB& read_db, const std::string& fn)
 {
     PROFILE_FUNC("index_file_from_fast5")
 
@@ -106,7 +102,7 @@ void index_file_from_fast5(ReadDB& read_db, const std::string& fn, const std::ma
 }
 
 //
-void index_path(ReadDB& read_db, const std::string& path, const std::map<std::string, std::string>& fast5_to_read_name_map)
+void index_path(ReadDB& read_db, const std::string& path, const std::multimap<std::string, std::string>& fast5_to_read_name_map)
 {
     fprintf(stderr, "[readdb] indexing %s\n", path.c_str());
     if (is_directory(path)) {
@@ -124,7 +120,7 @@ void index_path(ReadDB& read_db, const std::string& path, const std::map<std::st
                 if(!fast5_to_read_name_map.empty()) {
                     index_file_from_map(read_db, full_fn, fast5_to_read_name_map);
                 } else {
-                    index_file_from_fast5(read_db, full_fn, fast5_to_read_name_map);
+                    index_file_from_fast5(read_db, full_fn);
                 }
             }
         }
@@ -160,7 +156,7 @@ void exit_bad_header(const std::string& str, const std::string& filename)
 }
 
 //
-void parse_sequencing_summary(const std::string& filename, std::map<std::string, std::string>& out_map)
+void parse_sequencing_summary(const std::string& filename, std::multimap<std::string, std::string>& out_map)
 {
     // open
     std::ifstream in_file(filename.c_str());
@@ -203,7 +199,7 @@ void parse_sequencing_summary(const std::string& filename, std::map<std::string,
         fields = split(line, '\t');
         std::string fast5_filename = fields[filename_idx];
         std::string read_name = fields[read_name_idx];
-        out_map[fast5_filename] = read_name;
+        out_map.insert(std::make_pair(fast5_filename, read_name));
     }
 }
 
@@ -279,7 +275,7 @@ int index_main(int argc, char** argv)
 
     // Read a map from fast5 files to read name from the sequencing summary files (if any)
     process_summary_fofn();
-    std::map<std::string, std::string> fast5_to_read_name_map;
+    std::multimap<std::string, std::string> fast5_to_read_name_map;
     for(const auto& ss_filename : opt::sequencing_summary_files) {
         if(opt::verbose > 2) {
             fprintf(stderr, "summary: %s\n", ss_filename.c_str());


=====================================
src/nanopolish_polya_estimator.cpp
=====================================
@@ -220,15 +220,21 @@ private:
         // T -> T (100%)
         {0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 1.00f}
     };
-    // 50/50 chance of starting on L or S
-    float start_probs[HMM_NUM_STATES] = { 0.50f, 0.50f, 0.00f, 0.00f, 0.00f, 0.00f };
+    // All state sequences must start on S:
+    float start_probs[HMM_NUM_STATES] = { 1.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f };
 
     // ----- emission parameters:
     // emission parameters, from empirical MLE on manually-flagged reads:
-    // START, LEADER, and POLYA have Gaussian emissions;
+    // START has a mixture of Gaussian and Uniform emissions;
+    // LEADER and POLYA have Gaussian emissions;
     // ADAPTER, TRANSCRIPT have Gaussian mixture emissions;
     // CLIFF has Uniform emissions.
     GaussianParameters s_emission = {70.2737f, 3.7743f};
+    float s_begin = 40.0f;
+    float s_end = 250.0f;
+    float s_prob = 0.00476f; // == {1. / (250.0f - 40.0f)}
+    float s_norm_coeff = 0.50f;
+    float s_unif_coeff = 0.50f;
     GaussianParameters l_emission = {110.973f, 5.237f};
     GaussianParameters a0_emission = {79.347f, 8.3702f};
     GaussianParameters a1_emission = {63.3126f, 2.7464f};
@@ -265,7 +271,8 @@ private:
         float log_probs;
         if (state == HMM_START) {
             // START state:
-            log_probs = log_normal_pdf(xx, this->s_emission);
+            float norm_term = s_norm_coeff * normal_pdf(xx, this->s_emission);
+            log_probs = std::log(norm_term + s_unif_coeff * s_prob);
         }
         if (state == HMM_LEADER) {
             // LEADER state:
@@ -680,14 +687,13 @@ std::string pre_segmentation_qc(uint32_t suffix_clip, uint32_t prefix_clip, doub
 // These tests indicate that something went wrong in the segmentation algorithm.
 std::string post_segmentation_qc(const Segmentation& region_indices, const SquiggleRead& sr)
 {
-    // fetch sizes of LEADER, ADAPTER, POLYA regions:
-    double num_leader_samples = region_indices.leader;
+    // fetch sizes of ADAPTER and POLYA regions:
     double num_adapter_samples = (region_indices.adapter+1) - region_indices.leader;
     double num_polya_samples = region_indices.polya - (region_indices.adapter+1);
 
     // check for NOREGION:
     std::string qc_tag;
-    if (num_leader_samples < 200.0 || num_adapter_samples < 200.0 || num_polya_samples < 200.0) {
+    if (num_adapter_samples < 200.0 || num_polya_samples < 200.0) {
         qc_tag = "NOREGION";
     } else {
         qc_tag = "PASS";
@@ -733,6 +739,11 @@ void estimate_polya_for_single_read(const ReadDB& read_db,
                                     int region_start,
                                     int region_end)
 {
+    //----- check if primary or secondary alignment by inspecting FLAG; skip read if secondary:
+    if (record->core.flag & BAM_FSECONDARY) {
+        return;
+    }
+
     //----- load a squiggle read:
     std::string read_name = bam_get_qname(record);
     std::string ref_name(hdr->target_name[record->core.tid]);



View it on GitLab: https://salsa.debian.org/med-team/nanopolish/commit/7868af613b689fbb895f987c5608a25cb8c32549

-- 
View it on GitLab: https://salsa.debian.org/med-team/nanopolish/commit/7868af613b689fbb895f987c5608a25cb8c32549
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190710/232344b0/attachment-0001.html>


More information about the debian-med-commit mailing list