[med-svn] [Git][med-team/nanopolish][upstream] New upstream version 0.11.1
Andreas Tille
gitlab at salsa.debian.org
Wed Jul 10 12:24:30 BST 2019
Andreas Tille pushed to branch upstream at Debian Med / nanopolish
Commits:
7868af61 by Andreas Tille at 2019-07-10T11:12:03Z
New upstream version 0.11.1
- - - - -
10 changed files:
- .gitignore
- .travis.yml
- + Dockerfile-arm
- Makefile
- docs/source/index.rst
- docs/source/manual.rst
- scripts/calculate_methylation_frequency.py
- src/common/nanopolish_common.h
- src/nanopolish_index.cpp
- src/nanopolish_polya_estimator.cpp
Changes:
=====================================
.gitignore
=====================================
@@ -17,7 +17,7 @@ include/
lib/
share/
-hdf5-1.8.14.tar.gz
+hdf5-1.*.tar.gz
3.2.5.tar.bz2
eigen/
local*
=====================================
.travis.yml
=====================================
@@ -1,27 +1,83 @@
# travis.yml for github.com/jts/nanopolish
dist: trusty
+services: docker
sudo: false
language: generic
cache: apt
git:
depth: 1
+.nanopolish.ci.matrix-definitions:
+ - &cron-only
+ if: type = cron OR env(CI_CRON) = true
+ - &arch
+ before_install:
+ - |
+ sed -i "/^FROM / s/arm64/${ARCH}/" Dockerfile-arm
+ - |
+ docker run --rm --privileged \
+ multiarch/qemu-user-static:register --reset && \
+ docker build --rm -t nanopolish -f Dockerfile-arm .
+ script:
+ - |
+ docker run --rm -t \
+ -e HDF5="${HDF5:-install}" \
+ -e H5_CFLAGS="${H5_CFLAGS}" \
+ -e H5_INCLUDE="${H5_INCLUDE}" \
+ -e LDFLAGS="${LDFLAGS}" \
+ nanopolish
+
matrix:
include:
- # Set env for both nanoplish and the dependency hdf5.
- - env:
- - CC=gcc-4.8
- - CXX=g++-4.8
- - AR=gcc-ar-4.8
- - NM=gcc-nm-4.8
- - RANLIB=gcc-ranlib-4.8
- - env:
- - CC=gcc-8
- - CXX=g++-8
- - AR=gcc-ar-8
- - NM=gcc-nm-8
- - RANLIB=gcc-ranlib-8
+ # Set env for both nanopolish and the dependency hdf5.
+ - env:
+ - CC=gcc-4.8
+ - CXX=g++-4.8
+ - AR=gcc-ar-4.8
+ - NM=gcc-nm-4.8
+ - RANLIB=gcc-ranlib-4.8
+ - env:
+ - CC=gcc-8
+ - CXX=g++-8
+ - AR=gcc-ar-8
+ - NM=gcc-nm-8
+ - RANLIB=gcc-ranlib-8
+ # aarch64 - ARM 64-bit
+ - name: aarch64
+ sudo: required
+ env:
+ - ARCH=arm64
+ <<: *arch
+ <<: *cron-only
+ - name: aarch64-system-hdf5
+ sudo: required
+ env:
+ - ARCH=arm64
+ - HDF5="system"
+ - H5_INCLUDE="-I/usr/include/hdf5/serial"
+ - LDFLAGS="-L/usr/lib/aarch64-linux-gnu/hdf5/serial"
+ <<: *arch
+ # armv7l - ARM 32-bit
+ - name: armv7l
+ sudo: required
+ env:
+ - ARCH=armhf
+ <<: *arch
+ <<: *cron-only
+ - name: armv7l-system-hdf5
+ sudo: required
+ env:
+ - ARCH=armhf
+ - HDF5="system"
+ - H5_INCLUDE="-I/usr/include/hdf5/serial"
+ - LDFLAGS="-L/usr/lib/arm-linux-gnueabihf/hdf5/serial"
+ <<: *arch
+ allow_failures:
+ # The jobs installing hdf5 from source in docker finish with an error
+ # because the job exceeds the maximum time limit (50 minutes).
+ - name: aarch64
+ - name: armv7l
# Install and export newer gcc
before_install:
@@ -38,9 +94,11 @@ before_install:
sudo apt-get install -qq "${CXX}"
fi
-script:
+before_script:
# Suppress all compiler warnings for hdf5 Makefile
# to display the log without downloading the raw log on Travis log page.
# Travis finishes with an error when the log exceeds the 4 MB length limit.
- export H5_CFLAGS="-w"
+
+script:
- make && make test
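The new matrix entries lean on YAML anchors and merge keys: `&arch` defines the shared before_install/script mapping once, and `<<: *arch` splices it into each ARM job. A minimal sketch of the merge mechanism, assuming PyYAML is available (the job content here is illustrative, not the real matrix):

    # YAML merge-key demo: '<<: *arch' pulls the anchored mapping into a job.
    # Requires PyYAML; the keys below are invented for illustration.
    import yaml

    doc = """
    defs:
      - &arch
        before_install: ["docker build -t nanopolish -f Dockerfile-arm ."]
    jobs:
      - name: aarch64
        env: ["ARCH=arm64"]
        <<: *arch
    """

    job = yaml.safe_load(doc)["jobs"][0]
    # The job now carries its own keys plus those merged from &arch.
    print(job["name"], job["before_install"])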
=====================================
Dockerfile-arm
=====================================
@@ -0,0 +1,18 @@
+FROM multiarch/ubuntu-debootstrap:arm64-bionic
+RUN uname -a
+RUN apt-get update -qq && \
+ apt-get install -yq --no-install-suggests --no-install-recommends \
+ bzip2 \
+ ca-certificates \
+ gcc \
+ g++ \
+ make \
+ software-properties-common
+RUN add-apt-repository -y universe && \
+ apt-get update -qq && \
+ apt-get install -yq libhdf5-dev
+RUN find /usr/include -name "hdf5.h" || true
+RUN find /usr/lib -name "libhdf5.a" || true
+WORKDIR /nanopolish
+COPY . .
+CMD exec bash -c "make && make test"
=====================================
Makefile
=====================================
@@ -12,6 +12,7 @@ LIBS = -lz
CXXFLAGS ?= -g -O3
CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -D_FILE_OFFSET_BITS=64 #D_FILE_OFFSET_BITS=64 makes nanopolish work in 32 bit systems
CFLAGS ?= -O3 -std=c99 -fsigned-char -D_FILE_OFFSET_BITS=64
+LDFLAGS ?=
CXX ?= g++
CC ?= gcc
@@ -20,8 +21,7 @@ HDF5 ?= install
EIGEN ?= install
HTS ?= install
-HDF5_VERSION ?= 1.8.14
-HDF5_CONFIG_ARGS ?= --enable-threadsafe
+HDF5_VERSION ?= 1.10.4
EIGEN_VERSION ?= 3.2.5
# Check operating system, OSX doesn't have -lrt
@@ -38,7 +38,7 @@ ifeq ($(HDF5), install)
else
# Use system-wide hdf5
H5_LIB =
- H5_INCLUDE =
+ H5_INCLUDE ?=
LIBS += -lhdf5
endif
@@ -91,13 +91,13 @@ htslib/libhts.a:
#
lib/libhdf5.a:
if [ ! -e hdf5-$(HDF5_VERSION).tar.gz ]; then \
- version_major_minior=`echo "$(HDF5_VERSION)" | sed -E 's/\.[0-9]+$$//'`; \
- wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-$${version_major_minior}/hdf5-$(HDF5_VERSION)/src/hdf5-$(HDF5_VERSION).tar.gz; \
+ version_major_minor=`echo "$(HDF5_VERSION)" | sed -E 's/\.[0-9]+$$//'`; \
+ wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-$${version_major_minor}/hdf5-$(HDF5_VERSION)/src/hdf5-$(HDF5_VERSION).tar.gz; \
fi
tar -xzf hdf5-$(HDF5_VERSION).tar.gz || exit 255
cd hdf5-$(HDF5_VERSION) && \
- ./configure --enable-threadsafe --libdir=`pwd`/../lib --includedir=`pwd`/../include --prefix=`pwd`/.. && \
+ ./configure --enable-threadsafe --disable-hl --libdir=`pwd`/../lib --includedir=`pwd`/../include --prefix=`pwd`/.. && \
make && make install
# Download and install eigen if not already downloaded
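As an aside, the fixed download rule derives the major.minor component of the version (1.10 from 1.10.4) to build the release URL. A rough Python equivalent of that sed expression, reusing the URL layout from the rule above:

    # Mirror of sed -E 's/\.[0-9]+$//' from the Makefile rule.
    import re

    HDF5_VERSION = "1.10.4"
    major_minor = re.sub(r"\.\d+$", "", HDF5_VERSION)  # -> "1.10"
    url = ("https://support.hdfgroup.org/ftp/HDF5/releases/"
           "hdf5-%s/hdf5-%s/src/hdf5-%s.tar.gz"
           % (major_minor, HDF5_VERSION, HDF5_VERSION))
    print(url)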
=====================================
docs/source/index.rst
=====================================
@@ -15,6 +15,7 @@ nanopolish
quickstart_consensus
quickstart_eventalign
quickstart_call_methylation
+ quickstart_polya
debug
manual
=====================================
docs/source/manual.rst
=====================================
@@ -11,6 +11,7 @@ Modules available: ::
nanopolish variants --consensus: calculate an improved consensus sequence for a draft genome assembly
nanopolish eventalign: align signal-level events to k-mers of a reference genome
nanopolish phase-reads: Phase reads using heterozygous SNVs with respect to a reference genome
+ nanopolish polya: Estimate polyadenylated tail lengths on native RNA reads
|
extract
@@ -486,3 +487,63 @@ Usage example
- print out a progress message
+polya
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+Estimate the number of nucleotides in the poly(A) tails of native RNA reads.
+
+Input
+"""""""""""""""""""""""
+
+ * basecalled reads
+ * alignment information
+ * reference transcripts
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+ nanopolish polya [OPTIONS] --reads=reads.fa --bam=alignments.bam --genome=ref.fa
+
+.. list-table::
+ :widths: 20 10 20 50
+ :header-rows: 1
+
+ * - Argument name(s)
+ - Required
+ - Default value
+ - Description
+
+ * - ``-w, --window=STR``
+ - N
+ - NA
+ - Compute only for reads aligning to the reference window STR (format: ctg:start_id-end_id)
+
+ * - ``-r, --reads=FILE``
+ - Y
+ - NA
+ - the FAST(A/Q) file of native RNA reads
+
+ * - ``-b, --bam=FILE``
+ - Y
+ - NA
+ - the BAM file of alignments between reads and the reference
+
+ * - ``-g, --genome=FILE``
+ - Y
+ - NA
+ - the reference transcripts
+
+ * - ``-t, --threads=NUM``
+ - N
+ - 1
+ - use NUM threads
+
+ * - ``-v, -vv``
+ - N
+ - NA
+ - `-v` returns raw sample log-likelihoods, while `-vv` returns event durations
=====================================
scripts/calculate_methylation_frequency.py
=====================================
@@ -6,17 +6,9 @@ import csv
import argparse
from collections import namedtuple
-def make_key(c, s, e):
- return c + ":" + str(s) + ":" + str(e)
-
-def split_key(k):
- f = k.split(":")
- return (f[0], int(f[1]), int(f[2]))
-
class SiteStats:
def __init__(self, g_size, g_seq):
self.num_reads = 0
- self.posterior_methylated = 0
self.called_sites = 0
self.called_sites_methylated = 0
self.group_size = g_size
@@ -60,7 +52,7 @@ for record in csv_reader:
# if this is a multi-cpg group and split_groups is set, break up these sites
if args.split_groups and num_sites > 1:
- c = record['chromosome']
+ c = str(record['chromosome'])
s = int(record['start'])
e = int(record['end'])
@@ -69,20 +61,20 @@ for record in csv_reader:
cg_pos = sequence.find("CG")
first_cg_pos = cg_pos
while cg_pos != -1:
- key = make_key(c, s + cg_pos - first_cg_pos, s + cg_pos - first_cg_pos)
+ key = (c, s + cg_pos - first_cg_pos, s + cg_pos - first_cg_pos)
update_call_stats(key, 1, is_methylated, "split-group")
cg_pos = sequence.find("CG", cg_pos + 1)
else:
- key = make_key(record['chromosome'], record['start'], record['end'])
+ key = (str(record['chromosome']), int(record['start']), int(record['end']))
update_call_stats(key, num_sites, is_methylated, sequence)
# header
print("\t".join(["chromosome", "start", "end", "num_motifs_in_group", "called_sites", "called_sites_methylated", "methylated_frequency", "group_sequence"]))
-sorted_keys = sorted(sites.keys(), key = lambda x: split_key(x))
+sorted_keys = sorted(sites.keys(), key = lambda x: x)
for key in sorted_keys:
if sites[key].called_sites > 0:
- (c, s, e) = key.split(":")
+ (c, s, e) = key
f = float(sites[key].called_sites_methylated) / sites[key].called_sites
print("%s\t%s\t%s\t%d\t%d\t%d\t%.3f\t%s" % (c, s, e, sites[key].group_size, sites[key].called_sites, sites[key].called_sites_methylated, f, sites[key].sequence))
=====================================
src/common/nanopolish_common.h
=====================================
@@ -18,7 +18,7 @@
#include "logsum.h"
#define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.11.0"
+#define PACKAGE_VERSION "0.11.1"
#define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
//
=====================================
src/nanopolish_index.cpp
=====================================
@@ -56,7 +56,7 @@ namespace opt
static std::ostream* os_p;
//
-void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::map<std::string, std::string>& fast5_to_read_name_map)
+void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::multimap<std::string, std::string>& fast5_to_read_name_map)
{
PROFILE_FUNC("index_file_from_map")
@@ -64,20 +64,16 @@ void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::map<
size_t last_dir_pos = fn.find_last_of("/");
std::string fast5_basename = last_dir_pos == std::string::npos ? fn : fn.substr(last_dir_pos + 1);
- const auto& iter = fast5_to_read_name_map.find(fast5_basename);
- if(iter != fast5_to_read_name_map.end()) {
+ auto range = fast5_to_read_name_map.equal_range(fast5_basename);
+ for(auto iter = range.first; iter != range.second; ++iter) {
if(read_db.has_read(iter->second)) {
read_db.add_signal_path(iter->second.c_str(), fn);
}
- } else {
- if(opt::verbose > 0) {
- fprintf(stderr, "Could not find read %s in sequencing summary file\n", fn.c_str());
- }
}
}
//
-void index_file_from_fast5(ReadDB& read_db, const std::string& fn, const std::map<std::string, std::string>& fast5_to_read_name_map)
+void index_file_from_fast5(ReadDB& read_db, const std::string& fn)
{
PROFILE_FUNC("index_file_from_fast5")
@@ -106,7 +102,7 @@ void index_file_from_fast5(ReadDB& read_db, const std::string& fn, const std::ma
}
//
-void index_path(ReadDB& read_db, const std::string& path, const std::map<std::string, std::string>& fast5_to_read_name_map)
+void index_path(ReadDB& read_db, const std::string& path, const std::multimap<std::string, std::string>& fast5_to_read_name_map)
{
fprintf(stderr, "[readdb] indexing %s\n", path.c_str());
if (is_directory(path)) {
@@ -124,7 +120,7 @@ void index_path(ReadDB& read_db, const std::string& path, const std::map<std::st
if(!fast5_to_read_name_map.empty()) {
index_file_from_map(read_db, full_fn, fast5_to_read_name_map);
} else {
- index_file_from_fast5(read_db, full_fn, fast5_to_read_name_map);
+ index_file_from_fast5(read_db, full_fn);
}
}
}
@@ -160,7 +156,7 @@ void exit_bad_header(const std::string& str, const std::string& filename)
}
//
-void parse_sequencing_summary(const std::string& filename, std::map<std::string, std::string>& out_map)
+void parse_sequencing_summary(const std::string& filename, std::multimap<std::string, std::string>& out_map)
{
// open
std::ifstream in_file(filename.c_str());
@@ -203,7 +199,7 @@ void parse_sequencing_summary(const std::string& filename, std::map<std::string,
fields = split(line, '\t');
std::string fast5_filename = fields[filename_idx];
std::string read_name = fields[read_name_idx];
- out_map[fast5_filename] = read_name;
+ out_map.insert(std::make_pair(fast5_filename, read_name));
}
}
@@ -279,7 +275,7 @@ int index_main(int argc, char** argv)
// Read a map from fast5 files to read name from the sequencing summary files (if any)
process_summary_fofn();
- std::map<std::string, std::string> fast5_to_read_name_map;
+ std::multimap<std::string, std::string> fast5_to_read_name_map;
for(const auto& ss_filename : opt::sequencing_summary_files) {
if(opt::verbose > 2) {
fprintf(stderr, "summary: %s\n", ss_filename.c_str());
=====================================
src/nanopolish_polya_estimator.cpp
=====================================
@@ -220,15 +220,21 @@ private:
// T -> T (100%)
{0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 1.00f}
};
- // 50/50 chance of starting on L or S
- float start_probs[HMM_NUM_STATES] = { 0.50f, 0.50f, 0.00f, 0.00f, 0.00f, 0.00f };
+ // All state sequences must start on S:
+ float start_probs[HMM_NUM_STATES] = { 1.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f };
// ----- emission parameters:
// emission parameters, from empirical MLE on manually-flagged reads:
- // START, LEADER, and POLYA have Gaussian emissions;
+ // START has a mixture of Gaussian and Uniform emissions;
+ // LEADER and POLYA have Gaussian emissions;
// ADAPTER, TRANSCRIPT have Gaussian mixture emissions;
// CLIFF has Uniform emissions.
GaussianParameters s_emission = {70.2737f, 3.7743f};
+ float s_begin = 40.0f;
+ float s_end = 250.0f;
+ float s_prob = 0.00476f; // == {1. / (250.0f - 40.0f)}
+ float s_norm_coeff = 0.50f;
+ float s_unif_coeff = 0.50f;
GaussianParameters l_emission = {110.973f, 5.237f};
GaussianParameters a0_emission = {79.347f, 8.3702f};
GaussianParameters a1_emission = {63.3126f, 2.7464f};
@@ -265,7 +271,8 @@ private:
float log_probs;
if (state == HMM_START) {
// START state:
- log_probs = log_normal_pdf(xx, this->s_emission);
+ float norm_term = s_norm_coeff * normal_pdf(xx, this->s_emission);
+ log_probs = std::log(norm_term + s_unif_coeff * s_prob);
}
if (state == HMM_LEADER) {
// LEADER state:
@@ -680,14 +687,13 @@ std::string pre_segmentation_qc(uint32_t suffix_clip, uint32_t prefix_clip, doub
// These tests indicate that something went wrong in the segmentation algorithm.
std::string post_segmentation_qc(const Segmentation& region_indices, const SquiggleRead& sr)
{
- // fetch sizes of LEADER, ADAPTER, POLYA regions:
- double num_leader_samples = region_indices.leader;
+ // fetch sizes of ADAPTER and POLYA regions:
double num_adapter_samples = (region_indices.adapter+1) - region_indices.leader;
double num_polya_samples = region_indices.polya - (region_indices.adapter+1);
// check for NOREGION:
std::string qc_tag;
- if (num_leader_samples < 200.0 || num_adapter_samples < 200.0 || num_polya_samples < 200.0) {
+ if (num_adapter_samples < 200.0 || num_polya_samples < 200.0) {
qc_tag = "NOREGION";
} else {
qc_tag = "PASS";
@@ -733,6 +739,11 @@ void estimate_polya_for_single_read(const ReadDB& read_db,
int region_start,
int region_end)
{
+ //----- check if primary or secondary alignment by inspecting FLAG; skip read if secondary:
+ if (record->core.flag & BAM_FSECONDARY) {
+ return;
+ }
+
//----- load a squiggle read:
std::string read_name = bam_get_qname(record);
std::string ref_name(hdr->target_name[record->core.tid]);
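With this change the START state's emission density becomes an even 50/50 mixture of the original Gaussian and a uniform density over [40, 250] pA, and the log is taken of the mixed density rather than of the Gaussian alone. A hedged Python sketch of that computation, reusing the constants from the diff:

    # START-state emission: 0.5 * N(x; 70.2737, 3.7743) + 0.5 * U(40, 250).
    import math

    def log_start_emission(x, mu=70.2737, sigma=3.7743,
                           begin=40.0, end=250.0,
                           norm_coeff=0.5, unif_coeff=0.5):
        gauss = (math.exp(-0.5 * ((x - mu) / sigma) ** 2)
                 / (sigma * math.sqrt(2.0 * math.pi)))
        unif = 1.0 / (end - begin)  # ~0.00476, the s_prob constant above
        return math.log(norm_coeff * gauss + unif_coeff * unif)

    print(log_start_emission(70.0))   # near the Gaussian mode
    print(log_start_emission(200.0))  # tail: the uniform term dominates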
View it on GitLab: https://salsa.debian.org/med-team/nanopolish/commit/7868af613b689fbb895f987c5608a25cb8c32549