[med-svn] [Git][med-team/mosdepth][master] 3 commits: New upstream version 0.3.1
Steffen Möller
gitlab at salsa.debian.org
Mon Nov 2 21:50:41 GMT 2020
Steffen Möller pushed to branch master at Debian Med / mosdepth
Commits:
4466b303 by Steffen Moeller at 2020-11-02T22:39:00+01:00
New upstream version 0.3.1
- - - - -
6e9fc821 by Steffen Moeller at 2020-11-02T22:39:02+01:00
Update upstream source from tag 'upstream/0.3.1'
Update to upstream version '0.3.1'
with Debian dir 2db198d2533e9ccc8c79c1e2eee9983e973d8b1a
- - - - -
8f0e0415 by Steffen Moeller at 2020-11-02T22:50:09+01:00
New upstream version - compiles.
- - - - -
22 changed files:
- + .github/workflows/build.yml
- .gitignore
- .travis.yml
- CHANGES.md
- README.md
- + debian/TODO
- debian/changelog
- debian/control
- debian/copyright
- + debian/manpages
- + debian/mosdepth.install
- debian/rules
- debian/upstream/metadata
- debian/watch
- + depthstat.nim
- functional-tests.sh
- + int2str.nim
- mosdepth.nim
- mosdepth.nimble
- scripts/install.sh
- scripts/plot-dist.py
- tests/funcs.nim
Changes:
=====================================
.github/workflows/build.yml
=====================================
@@ -0,0 +1,113 @@
+# copied from Daniel Cook's Seq collection
+name: Build
+
+on:
+ - push
+ - pull_request
+
+jobs:
+ build:
+
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-18.04, macos-10.15]
+ nimversion:
+ - stable
+ - devel
+
+
+ steps:
+ - uses: actions/checkout at v2
+
+ # Caching
+ - name: Cache choosenim
+ id: cache-choosenim
+ uses: actions/cache at v1
+ with:
+ path: ~/.choosenim
+ key: ${{ runner.os }}-choosenim-stable
+
+ - name: Cache nimble
+ id: cache-nimble
+ uses: actions/cache at v1
+ with:
+ path: ~/.nimble
+ key: ${{ runner.os }}-nimble-stable
+
+ - name: Cache htslib
+ id: cache-htslib
+ uses: actions/cache at v1
+ with:
+ path: $HOME/htslib
+ key: ${{ runner.os }}-htslib-1.10
+
+ # Install Dependencies
+ - name: Install dependencies (Linux)
+ if: runner.os == 'Linux'
+ run: |
+ sudo apt-get update
+ sudo apt-get -qy install bwa make build-essential cmake libncurses-dev ncurses-dev libbz2-dev lzma-dev liblzma-dev \
+ curl libssl-dev libtool autoconf automake libcurl4-openssl-dev
+
+
+ - name: Install d4
+ git clone https://github.com/38/d4-format
+ cd d4-format
+ cargo build --release
+ sudo cp ../d4-format/target/release/libd4binding.* /usr/local/lib
+ sudo cp ./d4binding/include/d4.h /usr/local/include/
+
+ # Setup htslib
+ - name: Install htslib (linux)
+ if: runner.os == 'Linux'
+ run: |
+ cd
+ git clone --recursive https://github.com/samtools/htslib.git
+ cd htslib && git checkout 1.10 && autoheader && autoconf && ./configure --enable-libcurl
+ cd
+ make -j 4 -C htslib
+ echo "::set-env name=LD_LIBRARY_PATH::${LD_LIBRARY_PATH}:${HOME}/htslib"
+ ls -lh $HOME/htslib/*.so
+
+ - name: Install hstlib (macos)
+ if: runner.os == 'macOS'
+ run: |
+ brew install htslib
+
+ - uses: iffy/install-nim at v1.1
+ with:
+ nimversion: ${{ matrix.nimversion }}
+
+ - uses: actions-rs/toolchain at v1
+ with:
+ toolchain: stable
+ - uses: actions-rs/cargo at v1
+
+
+ # Build and Test
+ - name: Build test executable
+ run: nimble build -Y mosdepth.nimble
+
+ - name: "Copy binary"
+ run: chmod +x mosdepth && mkdir bin && cp mosdepth bin/mosdepth_debug_${{ matrix.os }}
+
+ - name: "Build and Copy release binary"
+ run: nim c -d:danger -d:release -o:bin/mosdepth_${{ matrix.os }} mosdepth
+
+ - name: Functional Tests
+ env:
+ TERM: "xterm"
+ run: |
+ bash ./functional-tests.sh
+
+ - name: Unit Tests
+ run: |
+ nim c -r tests/all.nim
+
+ - name: Upload Artifact
+ if: success()
+ uses: actions/upload-artifact at v1.0.0
+ with:
+ name: mosdepth_${{ matrix.os }}_executable
+ path: bin/
=====================================
.gitignore
=====================================
@@ -1,4 +1,5 @@
nimcache
+*.d4
*.bam
*.cram
*.bai
@@ -6,3 +7,6 @@ nimcache
profile_results.txt
t.*.bed.gz
t.*.bed.gz.csi
+*.bed.gz*
+*.dist.txt
+*.summary.txt
=====================================
.travis.yml
=====================================
@@ -3,7 +3,7 @@ env:
- BRANCH=devel
before_install:
- - export LD_LIBRARY_PATH=./htslib/
+ - export LD_LIBRARY_PATH=./htslib-1.10.2/:/usr/local/lib
- export PATH="$TRAVIS_BUILD_DIR/nim-$BRANCH/bin:$PATH"
- export NIM_LIB_PREFIX=$TRAVIS_BUILD_DIR/nim-$BRANCH/
@@ -12,6 +12,7 @@ install:
script:
- nimble test
- bash functional-tests.sh
+ - nim c -r depthstat.nim
- nim c -d:release --cc:$CC mosdepth.nim
- ./mosdepth -h
branches:
=====================================
CHANGES.md
=====================================
@@ -1,4 +1,37 @@
-0.2.5 (dev)
+v0.3.1
+======
++ fix bug with regions and d4 that would cause error even when --d4 was not used.
+
+v0.3.0
+======
++ allow chromosome names containing '-' as arguments for -c
++ d4 output
+
+0.2.9
+=====
++ modifies region.dist.txt to contain the aggregate coverage of each window when -b (integer) is specified
+ (otherwise region.dist.txt and global.disk.txt are identical with -b (integer) )
++ improve speed by ~30% when using per-base output with better int2str method
+
+0.2.8
+=====
++ fix off-by-one error in CSI index (but not data) of output bed files (#98)
+
+0.2.7
+=====
++ small optimizations
++ exit with 1 on bad help #80
++ fix check on remote bam (brentp/hts-nim#48)
++ fix erroneous assert #99
++ update static binary to htslib 1.10 (this fixes other bugs reported and closed in mosdepth)
+
+0.2.6
+=====
++ fix #54. for quantize
++ add summary file output (implemented by @danielecook)
++ add `--median` flag to output the median for each region from --by (default is to use mean).
+
+0.2.5
=====
+ remove dependency on PCRE
+ don't double count fully overlapping reads (thanks to @jaudoux for the fix in #73)
=====================================
README.md
=====================================
@@ -4,7 +4,8 @@ fast BAM/CRAM depth calculation for **WGS**, **exome**, or **targeted sequencing
![logo](https://user-images.githubusercontent.com/1739/29678184-da1f384c-88ba-11e7-9d98-df4fe3a59924.png "logo")
-[![Build Status](https://travis-ci.org/brentp/mosdepth.svg?branch=master)](https://travis-ci.org/brentp/mosdepth)
+[![Build](https://github.com/brentp/mosdepth/workflows/Build/badge.svg?branch=master)](https://github.com/brentp/mosdepth/actions?query=workflow%3ABuild)
+[![Build Status](https://travis-ci.com/brentp/mosdepth.svg?branch=master)](https://travis-ci.com/brentp/mosdepth)
[![citation](https://img.shields.io/badge/cite-open%20access-orange.svg)](https://academic.oup.com/bioinformatics/article/doi/10.1093/bioinformatics/btx699/4583630?guestAccessKey=35b55064-4566-4ab3-a769-32916fa1c6e6)
@@ -13,24 +14,28 @@ fast BAM/CRAM depth calculation for **WGS**, **exome**, or **targeted sequencing
+ per-base depth about 2x as fast `samtools depth`--about 25 minutes of CPU time for a 30X genome.
+ mean per-window depth given a window size--as would be used for CNV calling.
+ the mean per-region given a BED file of regions.
-+ a distribution of proportion of bases covered at or above a given threshhold for each chromosome and genome-wide.
+* the mean or median per-region cumulative coverage histogram given a window size
++ a distribution of proportion of bases covered at or above a given threshold for each chromosome and genome-wide.
+ quantized output that merges adjacent bases as long as they fall in the same coverage bins e.g. (10-20)
+ threshold output to indicate how many bases in each region are covered at the given thresholds.
++ A summary of mean depths per chromosome and within specified regions per chromosome.
++ a [d4](https://github.com/38/d4-format) file (better than bigwig).
when appropriate, the output files are bgzipped and indexed for ease of use.
## usage
```
-mosdepth 0.2.3
+mosdepth 0.3.0
Usage: mosdepth [options] <prefix> <BAM-or-CRAM>
Arguments:
- <prefix> outputs: `{prefix}.mosdepth.global.dist.txt` (NOTE!!! this is changed in version 0.2.2)
+ <prefix> outputs: `{prefix}.mosdepth.global.dist.txt`
+ `{prefix}.mosdepth.summary.txt`
`{prefix}.mosdepth.region.dist.txt` (if --by is specified)
- `{prefix}.per-base.bed.gz` (unless -n/--no-per-base is specified)
+ `{prefix}.per-base.bed.gz|per-base.d4` (unless -n/--no-per-base is specified)
`{prefix}.regions.bed.gz` (if --by is specified)
`{prefix}.quantized.bed.gz` (if --quantize is specified)
`{prefix}.thresholds.bed.gz` (if --thresholds is specified)
@@ -45,6 +50,8 @@ Common Options:
-n --no-per-base dont output per-base depth. skipping this output will speed execution
substantially. prefer quantized or thresholded values if possible.
-f --fasta <fasta> fasta file for use with CRAM files.
+ --d4 output per-base depth in d4 format. This is much faster.
+
Other options:
@@ -52,7 +59,7 @@ Other options:
-i --include-flag <FLAG> only include reads with any of the bits in FLAG set. default is unset. [default: 0]
-x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases).
-q --quantize <segments> write quantized output see docs for description.
- -Q --mapq <mapq> mapping quality threshold [default: 0]
+ -Q --mapq <mapq> mapping quality threshold. reads with a mapping quality less than this are ignored [default: 0]
-T --thresholds <thresholds> for each interval in --by, write number of bases covered by at
least threshold bases. Specify multiple integer values separated
by ','.
@@ -131,9 +138,21 @@ Output will go to `$sample.mosdepth.dist.txt`
This also forces the output to have 5 decimals of precision rather than the default of 2.
+## D4
+
+D4 is a format created by [Hao Hou](https://github.com/38) in the Quinlan lab. It is
+incorporated into `mosdepth` as of version 0.3.0 for per-base output with the `--d4` flag.
+It improves write speed dramatically; for one test-case it takes **24.8s** to write a
+per-base.bed.gz with mosdepth compared to **7.7s** to write a d4 file. For the same case,
+running `mosdepth` without writing per-base takes 5.9 seconds so D4 greatly mitigates
+the cost of outputing per-base depth **and** the output is more useful.
+
## Installation
-The simplest way is to [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/mosdepth/README.html)
+
+The simplest option is to download the [binary from the releases](https://github.com/brentp/mosdepth/releases).
+
+Another quick way is to [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/mosdepth/README.html)
It can also be installed with `brew` as `brew install brewsci/bio/mosdepth` or used via docker with quay:
```
@@ -141,10 +160,8 @@ docker pull quay.io/biocontainers/mosdepth:0.2.4--he527e40_0
docker run -v /hostpath/:/opt/mount quay.io/biocontainers/mosdepth:0.2.4--he527e40_0 mosdepth -n --fast-mode -t 4 --by 1000 /opt/mount/sample /opt/mount/$bam
```
-Unless you want to install [nim](https://nim-lang.org), simply download the
-[binary from the releases](https://github.com/brentp/mosdepth/releases).
-
-`mosdepth` uses requires htslib version 1.4 or later. If you get an error
+The binary from releases is static, with no dependencies. If you build it yourself,
+`mosdepth` requires htslib version 1.4 or later. If you get an error
about "`libhts.so` not found", set `LD_LIBRARY_PATH` to the directory that
contains `libhts.so`. e.g.
@@ -153,25 +170,6 @@ contains `libhts.so`. e.g.
If you get the error `could not import: hts_check_EOF` you may need to
install a more recent version of htslib.
-`mosdepth` also requires a recent version of [PCRE](https://www.pcre.org/), and will give the error `could not import: pcre_free_study` if the version of PCRE on your system is too old.
-If you do not have root and cannot get the system version of PCRE upgraded, you download and compile a local copy
-```
-cd ~/src/
-wget ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.41.tar.gz
-tar zxvf pcre-8.41.tar.gz
-cd pcre-8.41/
-./configure
-make
-```
-
-Then pass that path to mosdepth just like we did with htslib
-```
-LD_LIBRARY_PATH=~/src/pcre-8.41/.libs/:/~/src/htslib/ mosdepth -h
-```
-
-If you still see an error about `could not import: pcre_free_study` then
-for some, the solution has been to do: `ln -s /usr/local/lib/libpcre.so /usr/local/lib/libpcre.so.3`
-
If you do want to install from source, see the [travis.yml](https://github.com/brentp/mosdepth/blob/master/.travis.yml)
and the [install.sh](https://github.com/brentp/mosdepth/blob/master/scripts/install.sh).
@@ -187,7 +185,7 @@ for at least a given coverage value. It does this for each chromosome, and for t
whole genome.
Each row will indicate:
- + chromosome (or "genome")
+ + chromosome (or "total")
+ coverage level
+ proportion of bases covered at that level
=====================================
debian/TODO
=====================================
@@ -0,0 +1,3 @@
+Need to integrate https://github.com/38/d4-format
+
+ -- Steffen Mon, 02 Nov 2020 22:26:57 +0100
=====================================
debian/changelog
=====================================
@@ -1,11 +1,5 @@
-mosdepth (0.2.5-1) UNRELEASED; urgency=medium
+mosdepth (0.3.1-1) UNRELEASED; urgency=medium
- * Initial release (Closes: #<bug>)
- TODO: several nim libraries would be needed - giving up here
- https://github.com/brentp/hts-nim
- https://github.com/docopt/docopt.nim
- https://github.com/nitely/nim-regex
- https://github.com/nitely/nim-unicodedb
- ...
+ * Initial release (Closes: #973658)
- -- Andreas Tille <tille at debian.org> Sat, 13 Apr 2019 10:32:57 +0200
+ -- Steffen Moeller <moeller at debian.org> Sun, 31 Mar 2019 17:25:57 +0200
=====================================
debian/control
=====================================
@@ -1,34 +1,42 @@
Source: mosdepth
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
Section: science
Priority: optional
+Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
+Uploaders: Steffen Moeller <moeller at debian.org>
Build-Depends: debhelper (>= 12~),
nim,
- libpcre3-dev
-Standards-Version: 4.3.0
+ nim-docopt,
+ nim-regex,
+ nim-hts,
+ help2man
+Standards-Version: 4.5.0
Vcs-Browser: https://salsa.debian.org/med-team/mosdepth
Vcs-Git: https://salsa.debian.org/med-team/mosdepth.git
Homepage: https://github.com/brentp/mosdepth
+Rules-Requires-Root: no
Package: mosdepth
Architecture: any
-Depends: ${shlibs:Depends},
- ${misc:Depends}
-Description: fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing
- mosdepth can output:
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Description: BAM/CRAM depth calculation biological sequencing
+ Many small reads are produced by high-throughput "next generation"
+ sequencing technologies. The final sequence is derived from how
+ these reads are overlapping towards a consensus.
+ The more reads are covering/confirming parts of a nucleotide seq,
+ the higher the confidence is. Too many reads would be indicative
+ of e.g. repeats in the genome.
.
- * per-base depth about 2x as fast samtools depth--about 25 minutes of
- CPU time for a 30X genome.
- * mean per-window depth given a window size--as would be used for
- CNV calling.
- * the mean per-region given a BED file of regions.
- * a distribution of proportion of bases covered at or above a given
- threshhold for each chromosome and genome-wide.
- * quantized output that merges adjacent bases as long as they fall in
- the same coverage bins e.g. (10-20)
- * threshold output to indicate how many bases in each region are
- covered at the given thresholds.
- .
- when appropriate, the output files are bgzipped and indexed for
- ease of use.
+ mosdepth can output:
+ * per-base depth about 2x as fast samtools depth--about 25 minutes
+ of CPU time for a 30X genome.
+ * mean per-window depth given a window size--as would be used for
+ CNV calling.
+ * the mean per-region given a BED file of regions.
+ * a distribution of proportion of bases covered at or above a given
+ threshhold for each chromosome and genome-wide.
+ * quantized output that merges adjacent bases as long as they fall
+ in the same coverage bins e.g. (10-20)
+ * threshold output to indicate how many bases in each region are
+ covered at the given thresholds.
+ when appropriate, the output files are bgzipped and indexed for ease
+ of use.
=====================================
debian/copyright
=====================================
@@ -1,16 +1,22 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: mosdepth
-Source: https://github.com/brentp/mosdepth/releases
+Source: https://github.com/brentp/mosdepth
Files: *
-Copyright: 2017 Brent Pedersen
-License: MIT
+Copyright: 2017-2020 Brent Pedersen
+License: Expat
+
+Files: intToStr.nim
+Copyright: 2014 Milo Yip
+ adpated from: https://github.com/miloyip/itoa-benchmark countlut method..
+ https://github.com/miloyip/itoa-benchmark/blob/master/license.txt
+License: Expat
Files: debian/*
-Copyright: 2019 Andreas Tille <tille at debian.org>
-License: MIT
+Copyright: 2019-2020 Steffen Moeller <moeller at debian.org>
+License: Expat
-License: MIT
+License: Expat
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
@@ -28,3 +34,4 @@ License: MIT
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+
=====================================
debian/manpages
=====================================
@@ -0,0 +1 @@
+debian/mosdepth.1
=====================================
debian/mosdepth.install
=====================================
@@ -0,0 +1 @@
+mosdepth usr/bin
=====================================
debian/rules
=====================================
@@ -1,30 +1,17 @@
#!/usr/bin/make -f
-# DH_VERBOSE := 1
+ DH_VERBOSE := 1
export LC_ALL=C.UTF-8
include /usr/share/dpkg/default.mk
-# this provides:
-# DEB_SOURCE: the source package name
-# DEB_VERSION: the full version of the package (epoch + upstream vers. + revision)
-# DEB_VERSION_EPOCH_UPSTREAM: the package's version without the Debian revision
-# DEB_VERSION_UPSTREAM_REVISION: the package's version without the Debian epoch
-# DEB_VERSION_UPSTREAM: the package's upstream version
-# DEB_DISTRIBUTION: the distribution(s) listed in the current entry of debian/changelog
-# SOURCE_DATE_EPOCH: the source release date as seconds since the epoch, as
-# specified by <https://reproducible-builds.org/specs/source-date-epoch/>
-
-# for hardening you might like to uncomment this:
-# export DEB_BUILD_MAINT_OPTIONS=hardening=+all
%:
dh $@
override_dh_auto_build:
- nimble build -y
+ nim c -p:/usr/share/nimble/unicodeplus -p:/usr/share/nimble/unicodedb -p:/usr/share/nimble/regex -p:/usr/share/nimble/docopt -p:/usr/share/nimble/hts mosdepth.nim
+ help2man mosdepth > debian/mosdepth.1
-### When overriding auto_test make sure DEB_BUILD_OPTIONS will be respected
-#override_dh_auto_test:
-#ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
-# do_stuff_for_testing
-#endif
+override_dh_auto_clean:
+ rm -f debian/mosdepth.1
+ rm -f mosdepth
=====================================
debian/upstream/metadata
=====================================
@@ -1,20 +1,18 @@
Reference:
- - Author: Brent S Pedersen and Aaron R Quinlan
- Title: "Mosdepth: quick coverage calculation for genomes and exomes"
+ - Author: Brent S Pedersen and Aaron R. Quinlan
+ Title: >
+ Mosdepth: quick coverage calculation for genomes and exomes
Journal: Bioinformatics
Year: 2018
Volume: 34
Number: 5
- Pages: 867–868
+ Pages: 867-868
DOI: 10.1093/bioinformatics/btx699
PMID: 29096012
- URL: https://academic.oup.com/bioinformatics/article/34/5/867/4583630
- eprint: "https://academic.oup.com/bioinformatics/article-pdf/\
- 34/5/867/25331748/btx699.pdf"
+ URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6030888/
+ eprint: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6030888/pdf/btx699.pdf
Registry:
- Name: bio.tools
Entry: mosdepth
- - Name: OMICtools
- Entry: OMICS_20873
- Name: conda:bioconda
Entry: mosdepth
=====================================
debian/watch
=====================================
@@ -1,3 +1,4 @@
version=4
+#https://github.com/brentp/mospdepth/releases/latest .*/archive/#PREFIX#@ANY_VERSION@@ARCHIVE_EXT@
+https://github.com/brentp/mosdepth/tags .*/v([\d.])+\.tar\.gz
-https://github.com/brentp/mosdepth/releases .*/archive/v?@ANY_VERSION@@ARCHIVE_EXT@
=====================================
depthstat.nim
=====================================
@@ -0,0 +1,81 @@
+
+type
+ depth_stat* = object
+ cum_length*: int
+ cum_depth*: uint64
+ min_depth*: uint32
+ max_depth*: uint32
+
+type CountStat*[T:SomeOrdinal] = object
+ counts: seq[T]
+ n: int
+
+proc initCountStat*[T](size:int=32768): CountStat[T] =
+ return CountStat[T](counts: newSeq[T](size))
+
+proc add*[T](c: var CountStat, value: T) {.inline.} =
+ c.n.inc
+ if value.int > c.counts.high.int:
+ c.counts[c.counts.high].inc
+ elif value < 0:
+ raise newException(IndexError, "error setting negative depth value:" & $value)
+ else:
+ c.counts[value].inc
+
+proc median*[T](c: CountStat[T]): int {.inline.} =
+ var stop_n = int(0.5 + c.n.float64 * 0.5)
+ var cum = 0
+ for i, cnt in c.counts:
+ cum += cnt.int
+ if cum >= stop_n:
+ return i
+ return -1
+
+proc clear*[T](c: var CountStat[T]) {.inline.} =
+ if c.n == 0: return
+ c.n = 0
+ zeroMem(c.counts[0].addr, sizeof(c.counts[0]) * c.counts.len)
+
+template len*[T](c:CountStat[T]): int = c.counts.len
+
+proc newDepthStat*[T: SomeNumber](d: seq[T]): depth_stat =
+ result.cum_length = len(d)
+ result.min_depth = uint32.high
+ for dp in d:
+ result.cum_depth += dp.uint64
+ result.min_depth = min(result.min_depth, dp.uint32)
+ result.max_depth = max(result.max_depth, dp.uint32)
+
+proc clear*(ds: var depth_stat) =
+ ds.cum_length = 0
+ ds.cum_depth = 0
+ ds.min_depth = uint32.high
+ ds.max_depth = 0
+
+proc `+`*(a, b: depth_stat): depth_stat {.inline.} =
+ result = depth_stat(cum_length: a.cum_length + b.cum_length,
+ cum_depth: a.cum_depth + b.cum_depth,
+ min_depth: min(a.min_depth, b.min_depth),
+ max_depth: max(a.max_depth, b.max_depth))
+
+when isMainModule:
+ import unittest
+ suite "count-stat":
+ test "count-test":
+
+ var c = initCountStat[uint32]()
+ for i in 0..10:
+ c.add(i.uint32)
+
+ check c.median == 5
+ check c.n == 11
+ check c.len == 32768
+
+ c.add(6)
+ c.add(6)
+ c.add(6)
+ c.add(6)
+ check c.median == 6
+ c.clear()
+ check c.median == 0
+ check c.n == 0
=====================================
functional-tests.sh
=====================================
@@ -10,8 +10,10 @@ test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest
set -o nounset
+
set -e
nim c --boundChecks:on -x:on mosdepth.nim
+nim c -r tests/funcs.nim
set +e
exe=./mosdepth
bam=/data/human/NA12878.subset.bam
@@ -26,12 +28,16 @@ assert_exit_code 0
assert_equal "$(zgrep ^MT t.per-base.bed.gz)" "MT 0 80 1
MT 80 16569 0"
assert_equal "$(zgrep -w ^1 t.per-base.bed.gz)" "1 0 249250621 0"
+assert_equal "$(cat t.mosdepth.summary.txt | grep 'MT')" "MT 16569 80 0.00 0 1"
+assert_equal "$(cat t.mosdepth.summary.txt | grep 'total')" "total 16569 80 0.00 0 1"
run overlapFastMode $exe t --fast-mode tests/ovl.bam
assert_equal "$(zgrep ^MT t.per-base.bed.gz)" "MT 0 6 1
MT 6 42 2
MT 42 80 1
MT 80 16569 0"
+assert_equal $(cat t.mosdepth.summary.txt | cut -f 2 | tail -n+2 | sort | uniq) 16569
+assert_equal $(cat t.mosdepth.summary.txt | cut -f 4 | tail -n+2 | sort | uniq) "0.01"
assert_exit_code 0
@@ -41,7 +47,7 @@ assert_exit_code 1
run unordered_bed $exe --by tests/unordered.bed t tests/ovl.bam
assert_exit_code 0
-assert_equal $(zcat t.regions.bed.gz | wc -l) 2
+assert_equal $(zcat < t.regions.bed.gz | wc -l) 2
# theres data left in the bam but the region tree is empty...
run missing_bed_chrom $exe --by tests/missing.bed t tests/ovl.bam
@@ -68,13 +74,13 @@ assert_exit_code 0
rm -f t.thresholds.bed.gz*
run threshold_test $exe --by 100 -T 0,1,2,3,4,5 -c MT t tests/ovl.bam
-assert_equal "$(zcat t.thresholds.bed.gz | tail -n +2 | head -1)" "MT 0 100 unknown 100 80 0 0 0 0"
-assert_equal "0" "$(zcat t.thresholds.bed.gz | tail -n+2 | cut -f 7 | uniq)"
+assert_equal "$(zcat < t.thresholds.bed.gz | tail -n +2 | head -1)" "MT 0 100 unknown 100 80 0 0 0 0"
+assert_equal "0" "$(zcat < t.thresholds.bed.gz | tail -n+2 | cut -f 7 | uniq)"
assert_exit_code 0
rm -f t.thresholds.bed.gz*
run threshold_test_by $exe --by tests/track.bed -T 0,1,2 -c MT t tests/ovl.bam
-assert_equal "$(zcat t.thresholds.bed.gz | tail -n +2)" "MT 2 80 aregion 78 78 0"
+assert_equal "$(zcat < t.thresholds.bed.gz | tail -n +2)" "MT 2 80 aregion 78 78 0"
assert_exit_code 0
export MOSDEPTH_Q0=AAA
@@ -85,10 +91,13 @@ assert_exit_code 0
assert_equal "$(zgrep -w ^MT t.quantized.bed.gz)" "MT 0 80 BBB
MT 80 16569 AAA"
assert_equal "$(head -1 t.mosdepth.global.dist.txt)" "MT 1 0.0048283"
+assert_equal "$(tail -n +2 t.mosdepth.summary.txt | head -1)" "MT 16569 80 0.0048283 0 1"
+assert_equal "$(tail -n 1 t.mosdepth.summary.txt)" "total 16569 80 0.0048283 0 1"
+
run track_header $exe --by tests/track.bed t tests/ovl.bam
assert_exit_code 0
-assert_equal "$(zcat t.regions.bed.gz)" "MT 2 80 aregion 1.00"
+assert_equal "$(zcat < t.regions.bed.gz)" "MT 2 80 aregion 1.00"
run track_header_by $exe --by tests/bad.bed t tests/ovl.bam
assert_exit_code 1
@@ -107,20 +116,38 @@ assert_equal "$(cat tt.mosdepth.global.dist.txt)" "MT 0 1.00
total 0 1.00"
run big_chrom $exe t tests/big.bam
+assert_equal $(cat t.mosdepth.summary.txt | wc -l) 3
+assert_equal $(cat t.mosdepth.summary.txt | cut -f 3 | tail -n +2 | uniq) 15
+assert_equal $(cat t.mosdepth.summary.txt | cut -f 5 | tail -n +2 | uniq) 0
+assert_equal $(cat t.mosdepth.summary.txt | cut -f 6 | tail -n +2 | uniq) 1
assert_exit_code 0
-rm -f tt.mosdepth.region.dist.txt
-rm -f t.mosdepth.region.dist.txt
+rm -f tt.*
+rm -f t.*
+
run empty_tids $exe t -n --thresholds 1,5 --by tests/empty-tids.bed tests/empty-tids.bam
assert_exit_code 0
assert_equal $(grep -w HPV26 -c t.mosdepth.region.dist.txt) 0
+rm -f t.*
+
+#regions and global should not be the same when -b is specified
+run regions $exe t_regions -n -b 100 tests/empty-tids.bam
+assert_equal $(diff -u t_regions.mosdepth.region.dist.txt t_regions.mosdepth.global.dist.txt | wc -l) 1447
+rm -f t_regions.*
-rm -f t.per-base.bed.gz*
run overlappingPairs $exe t tests/overlapping-pairs.bam
-assert_equal "$(zcat t.per-base.bed.gz)" "1 0 565173 0
+assert_equal "$(zcat < t.per-base.bed.gz)" "1 0 565173 0
1 565173 565253 1
1 565253 249250621 0"
assert_exit_code 0
+rm -f t.*
+
+
+nim c -d:d4 --boundChecks:on -x:on mosdepth.nim
+rm -f t.per-base.bed.gz
+run d4test $exe --d4 t --fast-mode tests/ovl.bam
+assert_exit_code 0
+
test -e $bam || exit
@@ -129,4 +156,13 @@ assert_equal "$(zgrep -w "chrM 9999 10000" t.per-base.bed.gz)" "chrM 9999 10000
assert_exit_code 0
run flag $exe -c chrM -F 4 --by 20000 tx /data/human/NA12878.subset.bam
-assert_exit_code 0
\ No newline at end of file
+assert_exit_code 0
+
+
+run check_exit_code_on_bad_args $exe -t4prefix sample.bam
+assert_exit_code 1
+assert_in_stderr "error parsing arguments"
+
+
+
+
=====================================
int2str.nim
=====================================
@@ -0,0 +1,95 @@
+import bitops
+
+#[
+
+this intToStr is adpated from: https://github.com/miloyip/itoa-benchmark countlut method.
+
+https://github.com/miloyip/itoa-benchmark/blob/master/license.txt
+Copyright (C) 2014 Milo Yip
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+]#
+
+
+const gDigitsLut = [
+ '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
+ '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
+ '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
+ '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
+ '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
+ '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
+ '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
+ '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
+ '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
+ '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
+];
+
+const powers_of_10 = [
+ 0'i32,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000]
+
+proc countdigits(value:int32): int {.inline.} =
+ result = (32 - countLeadingZeroBits(value or 1)) * 1233 shr 12
+ result = result - int(value < powers_of_10[result]) + 1
+
+proc fastIntToStr*(value:int32, outstr:var string, offset:int=0) {.inline.} =
+ outstr.setLen(offset + countdigits(value))
+ var value = value
+ var L = outstr.high
+ while value >= 100:
+ let i = (value mod 100) shl 1
+ value = int32(value / 100)
+
+ outstr[L] = gDigitsLut[i + 1]
+ outstr[L-1] = gDigitsLut[i]
+ L -= 2
+ if value < 10:
+ outstr[L] = (value + '0'.int).char
+
+ else:
+ let i = value shl 1
+ outstr[L] = gDigitsLut[i + 1]
+ outstr[L-1] = gDigitsLut[i]
+
+
+when isMainModule:
+
+ import times
+ import strutils
+
+ var t0 = cpuTime()
+ var outstr = newString(10)
+ for i in 0'i32..200_000_000:
+ fastIntToStr(i, outstr)
+ doAssert outstr == $i
+ echo cpuTime() - t0
+
+ t0 = cpuTime()
+ echo "only to 100m for $"
+ for i in 0'i32..100_000_000:
+ let outstr = intToStr(i)
+ echo cpuTime() - t0
=====================================
mosdepth.nim
=====================================
@@ -1,15 +1,23 @@
import hts
import tables
+import ./int2str
import strutils as S
import algorithm as alg
import sequtils as sequtils
import strutils as su
import os
+import strformat
import docopt
import times
+import math
+import ./depthstat
+when defined(d4):
+ import d4
var precision: int
+var output_summary_header = true
+
try:
var tmp = getEnv("MOSDEPTH_PRECISION")
precision = parse_int(tmp)
@@ -20,11 +28,11 @@ type
pair = tuple[pos: int, value: int32]
depth_t = tuple[start: int, stop: int, value: int]
depth_s = tuple[start: int, stop: int, value: string]
- region_t = ref object
- chrom: string
- start: uint32
- stop: uint32
- name: string
+ region_t* = ref object
+ chrom*: string
+ start*: uint32
+ stop*: uint32
+ name*: string
coverage_t = seq[int32]
@@ -38,10 +46,7 @@ proc `$`*(r: region_t): string =
proc to_coverage(c: var coverage_t) =
# to_coverage converts from an array of start/end inc/decs to actual coverage.
- var d = int32(0)
- for i, v in pairs(c):
- d += v
- c[i] = d
+ c.cumsum()
iterator gen_depths(arr: coverage_t, offset: int=0, istop: int=0): depth_t =
# given `arr` with values in each index indicating the number of reads
@@ -125,7 +130,7 @@ iterator gen_quantized(quants: seq[int], arr: coverage_t): depth_s {.inline.} =
yield (last_pos, pos, lookup[last_quantized])
last_quantized = quantized
last_pos = pos
- if last_quantized != -1 and last_pos < arr.high:
+ if last_quantized != -1 and last_pos < arr.high and last_quantized < len(lookup):
yield (last_pos, len(arr)-1, lookup[last_quantized])
proc pair_sort(a, b: pair): int =
@@ -184,24 +189,28 @@ proc bed_line_to_region(line: string): region_t =
s = S.parse_int(cse[1])
e = S.parse_int(cse[2])
reg = region_t(chrom: cse[0], start: uint32(s), stop: uint32(e))
+ doAssert s <= e, "[slivar] ERROR: start > end in bed line:" & line
if len(cse) > 3:
reg.name = cse[3]
return reg
-proc region_line_to_region(region: string): region_t =
+proc region_line_to_region*(region: string): region_t =
if region == "" or region == "nil":
return nil
+ if ':' notin region:
+ return region_t(chrom:region)
+ result = region_t()
+
var i = 0
- var r = region_t()
- for w in region.split({':', '-'}):
+ # rsplit yields strings in reverse order
+ for w in region.rsplit({':', '-'}, maxsplit=2):
if i == 1:
- r.start = uint32(S.parse_int(w)) - 1
- elif i == 2:
- r.stop = uint32(S.parse_int(w))
+ result.start = uint32(S.parse_int(w)) - 1
+ elif i == 0:
+ result.stop = uint32(S.parse_int(w))
else:
- r.chrom = w
+ result.chrom = w
inc(i)
- return r
proc get_tid(tgts: seq[hts.Target], chrom: string): int =
for t in tgts:
@@ -285,11 +294,10 @@ proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int=
# 4623241 4623264
# chr1 4623171 69M1D23M9S (pos: 4623171, value: 1)(pos: 4623241, value: 1)(pos: 4623240, value: -1)(pos: 4623264, value: -1)
# chr1 4623223 4S97M (pos: 4623223, value: 1)(pos: 4623320, value: -1)
- assert (rec.start <= mate.stop)
# each element will have a .value of 1 for start and -1 for end.
- var ses = sequtils.to_seq(gen_start_ends(rec.cigar, rec.start))
- for p in gen_start_ends(mate.cigar, mate.start):
+ var ses = sequtils.to_seq(gen_start_ends(rec.cigar, rec.start.int))
+ for p in gen_start_ends(mate.cigar, mate.start.int):
ses.add(p)
alg.sort(ses, pair_sort)
var pair_depth = 0
@@ -306,10 +314,10 @@ proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int=
last_pos = p.pos
if pair_depth != 0: echo $rec.qname & ":" & $rec & " " & $mate.qname & ":" & $mate & " " & $pair_depth
if fast_mode:
- arr[rec.start].inc
- arr[rec.stop].dec
+ arr[rec.start] += 1
+ arr[rec.stop] -= 1
else:
- inc_coverage(rec.cigar, rec.start, arr)
+ inc_coverage(rec.cigar, rec.start.int, arr)
if not found:
return -2
@@ -352,28 +360,33 @@ iterator region_gen(window: uint32, target: hts.Target, bed_regions: TableRef[st
for r in bed_regions[target.name]: yield r
bed_regions.del(target.name)
-proc imean(vals: coverage_t, start:uint32, stop:uint32): float64 =
+proc imean(vals: coverage_t, start:uint32, stop:uint32, ms:var CountStat[uint32]): float64 =
if start > uint32(len(vals)):
return 0
- var L = float64(stop - start)
- for i in start..<stop:
- if int(i) == len(vals): break
- result += float64(vals[int(i)]) / L
+
+ if ms.len != 0:
+ ms.clear()
+ for i in start..<min(stop, uint32(len(vals))):
+ ms.add(vals[i])
+ return ms.median.float64
+
+ else:
+ var L = float64(stop - start)
+ for i in start..<min(stop, uint32(len(vals))):
+ result += float64(vals[int(i)]) / L
const MAX_COVERAGE = int32(400000)
-proc inc(d: var seq[int64], coverage: coverage_t, start:uint32, stop:uint32) =
+proc inc(d: var seq[int64], coverage: var coverage_t, start:uint32, stop:uint32) =
var v:int32
var L = int32(d.high)
if int(start) >= len(coverage):
stderr.write_line("[mosdepth] warning requested interval outside of chromosome range:", start, "..", stop)
return
- var istop = stop
- if int(stop) > len(coverage):
- istop = uint32(len(coverage))
+ var istop = min(stop, uint32(coverage.len))
for i in start..<istop:
- v = coverage[int(i)]
+ v = coverage[i]
if v > MAX_COVERAGE:
v = MAX_COVERAGE - 10
if v >= L:
@@ -382,7 +395,7 @@ proc inc(d: var seq[int64], coverage: coverage_t, start:uint32, stop:uint32) =
d[j] = 0
L = int32(d.high)
if v < 0: continue
- inc(d[v])
+ d[v] += 1
proc write_distribution(chrom: string, d: var seq[int64], fh:File) =
var sum: int64
@@ -404,6 +417,28 @@ proc write_distribution(chrom: string, d: var seq[int64], fh:File) =
# reverse it back because we use to update the full genome
reverse(d)
+proc write_summary(region: string, stat: depth_stat, fh:File) =
+ var mean_depth: float64
+ if stat.cum_length > 0:
+ mean_depth = float64(stat.cum_depth) / float64(stat.cum_length)
+ else:
+ mean_depth = 0.float64
+ let stat_min = if stat.min_depth == uint32.high: 0.uint32 else: stat.min_depth
+ if output_summary_header:
+ fh.write_line ["chrom",
+ "length",
+ "bases",
+ "mean",
+ "min",
+ "max"].join("\t")
+ output_summary_header = false
+ fh.write_line [region,
+ $stat.cum_length,
+ $stat.cum_depth,
+ $mean_depth.format_float(ffDecimal, precision=precision),
+ $stat_min,
+ $stat.max_depth].join("\t")
+
proc get_targets(targets: seq[hts.Target], r: region_t): seq[hts.Target] =
if r == nil:
return targets
@@ -444,11 +479,11 @@ proc get_quantize_args*(qa: string) : seq[int] =
quit(2)
-proc write_thresholds(fh:BGZI, tid:int, arr:coverage_t, thresholds:seq[int], region: region_t) =
+proc write_thresholds(fh:BGZI, tid:int, arr:var coverage_t, thresholds:seq[int], region: region_t) =
# write the number of bases in each region that are >= each threshold.
if thresholds.len == 0: return
var
- line = new_string_of_cap(100)
+ line = new_string_of_cap(32)
start = int(region.start)
stop = int(region.stop)
line.add(region.chrom & "\t")
@@ -466,13 +501,14 @@ proc write_thresholds(fh:BGZI, tid:int, arr:coverage_t, thresholds:seq[int], reg
return
var counts = new_seq[int](len(thresholds))
+ shallow(arr)
# iterate over the region and count bases >= request cutoffs.
for v in arr[start..<stop]:
for i, t in thresholds:
- if v >= t:
- counts[i] += 1
- # else: break # if we know they are sorted we can break
+ # if we know they are sorted we can break
+ if v < t: break
+ counts[i] += 1
for count in counts:
line.add("\t" & intToStr(count))
@@ -498,8 +534,18 @@ proc get_min_levels(targets: seq[Target]): int =
result += 1
s = s shl 3
+proc isdigit(s:string): bool =
+ for c in s:
+ if not c.isdigit: return false
+ return true
+
+proc to_tuples(targets:seq[Target]): seq[tuple[name:string, length:int]] =
+ result.setLen(targets.len)
+ for i, t in targets:
+ result[i] = (t.name, t.length.int)
-proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16, region: string, thresholds: seq[int], fast_mode:bool, args: Table[string, docopt.Value]) =
+proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16, region: string, thresholds: seq[int],
+ fast_mode:bool, args: Table[string, docopt.Value], use_median:bool=false, use_d4:bool=false) =
# windows are either from regions, or fixed-length windows.
# we assume the input is sorted by chrom.
var
@@ -512,15 +558,25 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
skip_per_base = args["--no-per-base"]
window: uint32 = 0
bed_regions: TableRef[string, seq[region_t]] # = Table[string, seq[region_t]]
- fbase: BGZI
#fbase: BGZ
fquantize: BGZI
fthresholds: BGZI
fregion: BGZI
fh_global_dist:File
fh_region_dist:File
+ fh_summary: File
quantize = get_quantize_args($args["--quantize"])
+ # summary stat output
+ chrom_region_stat: depth_stat
+ chrom_stat: depth_stat
+ global_region_stat: depth_stat
+ global_stat: depth_stat
+
+ when defined(d4):
+ var fd4:D4
+ var fbase: BGZI
+
var region_distribution = new_seq[int64](1000)
var global_distribution = new_seq[int64](1000)
@@ -534,7 +590,13 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
if not skip_per_base:
# can't use set-threads when indexing on the fly so this must
# not call set_threads().
- fbase = wopen_bgzi(prefix & ".per-base.bed.gz", 1, 2, 3, true, compression_level=1, levels=levels)
+ if use_d4:
+ when defined(d4):
+ doAssert fd4.open(prefix & ".per-base.d4", mode="w"), &"[mosdepth] error opening {prefix}.per-base.d4"
+ fd4.set_chromosomes(targets.to_tuples)
+
+ else:
+ fbase = wopen_bgzi(prefix & ".per-base.bed.gz", 1, 2, 3, true, compression_level=1, levels=levels)
#open(fbase, prefix & ".per-base.bed.gz", "w1")
if quantize.len != 0:
fquantize = wopen_bgzi(prefix & ".quantized.bed.gz", 1, 2, 3, true, compression_level=1, levels=levels)
@@ -546,6 +608,9 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
if not open(fh_global_dist, prefix & ".mosdepth.global.dist.txt", fmWrite):
stderr.write_line("[mosdepth] could not open file:", prefix & ".mosdepth.global.dist.txt")
+ if not open(fh_summary, prefix & ".mosdepth.summary.txt", fmWrite):
+ stderr.write_line("[mosdepth] could not open file:", prefix & ".mosdepth.summary.txt")
+
if region != "" and not open(fh_region_dist, prefix & ".mosdepth.region.dist.txt", fmWrite):
stderr.write_line("[mosdepth] could not open file:", prefix & ".mosdepth.dist.txt")
@@ -555,6 +620,9 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
window = uint32(S.parse_int(region))
else:
bed_regions = bed_to_table(region)
+ shallow(arr)
+
+ var cs = initCountStat[uint32](size=if use_median: 65536 else: 0)
for target in sub_targets:
chrom_global_distribution = new_seq[int64](1000)
@@ -575,7 +643,8 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
var me = 0'f64
for r in region_gen(window, target, bed_regions):
if tid != -2:
- me = imean(arr, r.start, r.stop)
+ me = imean(arr, r.start, r.stop, cs)
+ chrom_region_stat = chrom_region_stat + newDepthStat(arr[r.start..<r.stop])
var m = su.format_float(me, ffDecimal, precision=precision)
if r.name == "":
@@ -585,10 +654,21 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
discard fregion.write_interval(line, target.name, int(r.start), int(r.stop))
line = line[0..<0]
if tid != -2:
- chrom_region_distribution.inc(arr, r.start, r.stop)
+ if region.isdigit: #stores the aggregated coverage for each region when working with even windows across the genome
+ chrom_region_distribution[min(me.toInt,int64(len(chrom_region_distribution))-1)] += 1
+ else: # stores the per-base coverage in each region specified in the bed file
+ chrom_region_distribution.inc(arr, r.start, r.stop)
+
write_thresholds(fthresholds, tid, arr, thresholds, r)
if tid != -2:
chrom_global_distribution.inc(arr, uint32(0), uint32(len(arr) - 1))
+ chrom_stat = newDepthStat(arr[0..<len(arr)-1])
+ global_stat = global_stat + chrom_stat
+ write_summary(target.name, chrom_stat, fh_summary)
+ if region != "":
+ write_summary(target.name & "_region", chrom_region_stat, fh_summary)
+ global_region_stat = global_region_stat + chrom_region_stat
+ chrom_region_stat.clear()
# write the distribution for each chrom
write_distribution(target.name, chrom_global_distribution, fh_global_dist)
@@ -603,10 +683,30 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
if not skip_per_base:
if tid == -2:
- discard fbase.write_interval(starget & "0\t" & intToStr(int(target.length)) & "\t0", target.name, 0, int(target.length))
+ when defined(d4):
+ if use_d4:
+ fd4.write(target.name, @[Interval(left: 0'u32, right: target.length.uint32, value: 0'i32)])
+ else:
+ discard fbase.write_interval(starget & "0\t" & intToStr(int(target.length)) & "\t0", target.name, 0, int(target.length))
else:
- for p in gen_depths(arr):
- discard fbase.write_interval(starget & intToStr(p.start) & "\t" & intToStr(p.stop) & "\t" & intToStr(p.value), target.name, p.start, p.stop)
+ var write_fbase = true
+ when defined(d4):
+ if use_d4:
+ fd4.write(target.name, 0, arr)
+ write_fbase = false
+
+ if write_fbase:
+ var line = newStringOfCap(32)
+ line.add(starget)
+ for p in gen_depths(arr):
+ # re-use line each time.
+ line.setLen(starget.len)
+ fastIntToStr(p.start.int32, line, line.len)
+ line.add('\t')
+ fastIntToStr(p.stop.int32, line, line.len)
+ line.add('\t')
+ fastIntToStr(p.value.int32, line, line.len)
+ discard fbase.write_interval(line, target.name, p.start, p.stop)
if quantize.len != 0:
if tid == -2 and quantize[0] == 0:
var lookup = make_lookup(quantize)
@@ -616,6 +716,10 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
for p in gen_quantized(quantize, arr):
discard fquantize.write_interval(starget & intToStr(p.start) & "\t" & intToStr(p.stop) & "\t" & p.value, target.name, p.start, p.stop)
+ write_summary("total", global_stat, fh_summary)
+ if region != "":
+ write_summary("total_region", global_region_stat, fh_summary)
+
write_distribution("total", global_distribution, fh_global_dist)
if region != "":
write_distribution("total", region_distribution, fh_region_dist)
@@ -636,6 +740,9 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, eflag: uint16, iflag: uint16
if fthresholds != nil and close(fthresholds) != 0:
stderr.write_line("[mosdepth] error writing thresholds file\n")
quit(1)
+ when defined(d4):
+ if use_d4:
+ fd4.close
if fbase != nil and close(fbase) != 0:
stderr.write_line("[mosdepth] error writing per-base file\n")
@@ -653,7 +760,8 @@ proc check_chrom(r: region_t, targets: seq[Target]) =
proc threshold_args*(ts: string): seq[int] =
if ts == "nil":
return
- return map(ts.split(','), proc (s:string): int = return parse_int(s))
+ result = map(ts.split(','), proc (s:string): int = return parse_int(s))
+ sort(result)
proc check_cram_has_ref(cram_path: string, fasta:string) =
@@ -667,9 +775,9 @@ when(isMainModule):
when not defined(release) and not defined(lto):
stderr.write_line "[mosdepth] WARNING: built in debug mode; will be slow"
- let version = "mosdepth 0.2.5"
+ let version = "mosdepth 0.3.1"
let env_fasta = getEnv("REF_PATH")
- let doc = format("""
+ var doc = format("""
$version
Usage: mosdepth [options] <prefix> <BAM-or-CRAM>
@@ -677,6 +785,7 @@ when(isMainModule):
Arguments:
<prefix> outputs: `{prefix}.mosdepth.dist.txt`
+ `{prefix}.mosdepth.summary.txt`
`{prefix}.per-base.bed.gz` (unless -n/--no-per-base is specified)
`{prefix}.regions.bed.gz` (if --by is specified)
`{prefix}.quantized.bed.gz` (if --quantize is specified)
@@ -692,6 +801,12 @@ Common Options:
-n --no-per-base dont output per-base depth. skipping this output will speed execution
substantially. prefer quantized or thresholded values if possible.
-f --fasta <fasta> fasta file for use with CRAM files [default: $env_fasta].
+""" % ["version", version, "env_fasta", env_fasta])
+ when defined(d4):
+ doc &= """ --d4 output per-base depth in d4 format.
+"""
+
+ doc &= """
Other options:
@@ -699,20 +814,32 @@ Other options:
-i --include-flag <FLAG> only include reads with any of the bits in FLAG set. default is unset. [default: 0]
-x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases).
-q --quantize <segments> write quantized output see docs for description.
- -Q --mapq <mapq> mapping quality threshold [default: 0]
+ -Q --mapq <mapq> mapping quality threshold. reads with a quality less than this value are ignored [default: 0]
-T --thresholds <thresholds> for each interval in --by, write number of bases covered by at
least threshold bases. Specify multiple integer values separated
by ','.
+ -m --use-median output median of each region (in --by) instead of mean.
-R --read-groups <string> only calculate depth for these comma-separated read groups IDs.
-h --help show help
- """ % ["version", version, "env_fasta", env_fasta])
+ """
+
+ var args: Table[string, Value]
+ try:
+ args = docopt(doc, version = version, quit=false)
+ except DocoptExit:
+ echo (ref DocoptExit)(get_current_exception()).usage
+ quit "error parsing arguments"
- let args = docopt(doc, version = version)
let mapq = S.parse_int($args["--mapq"])
var
region: string
thresholds: seq[int] = threshold_args($args["--thresholds"])
fast_mode:bool = args["--fast-mode"]
+ use_median:bool = args["--use-median"]
+ when defined(d4):
+ var use_d4:bool = args["--d4"] and not args["--no-per-base"]
+ else:
+ var use_d4:bool = false
if $args["--by"] != "nil":
region = $args["--by"]
@@ -748,4 +875,4 @@ Other options:
discard bam.set_option(FormatOption.CRAM_OPT_DECODE_MD, 0)
check_chrom(chrom, bam.hdr.targets)
- main(bam, chrom, mapq, eflag, iflag, region, thresholds, fast_mode, args)
+ main(bam, chrom, mapq, eflag, iflag, region, thresholds, fast_mode, args, use_median=use_median, use_d4=use_d4)
=====================================
mosdepth.nimble
=====================================
@@ -1,13 +1,13 @@
# Package
-version = "0.2.5"
+version = "0.3.1"
author = "Brent Pedersen"
description = "fast depth"
license = "MIT"
# Dependencies
-requires "hts >= 0.2.7", "docopt >= 0.6.8"
+requires "hts >= 0.3.1", "docopt >= 0.6.8", "nim >= 1.0.0", "https://github.com/brentp/d4-nim"
bin = @["mosdepth"]
skipDirs = @["tests"]
=====================================
scripts/install.sh
=====================================
@@ -21,10 +21,26 @@ cd $base
nimble refresh
$base/nim-$BRANCH/bin/nimble install -y
-git clone --recursive https://github.com/samtools/htslib.git
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -qy
+
+export HTSLIB=dynamic
+sudo apt-get update
+sudo apt-get install git llvm curl wget libcurl4-openssl-dev
+wget https://github.com/samtools/htslib/archive/1.10.2.tar.gz
+tar xzf 1.10.2.tar.gz
+cd htslib-1.10.2/
+autoheader && autoconf && ./configure --enable-libcurl
+sudo make -j4 install
+git clone https://github.com/38/d4-format
+cd d4-format
+~/.cargo/bin/cargo build --release
+sudo cp ../d4-format/target/release/libd4binding.* /usr/local/lib
+sudo cp ./d4binding/include/d4.h /usr/local/include/
+sudo ldconfig
+
-cd htslib && git checkout 1.9 && autoheader && autoconf && ./configure --enable-libcurl
cd ..
-make -j 4 -C htslib
-export LD_LIBRARY_PATH=$base/htslib
-ls -lh $base/htslib/*.so
+export LD_LIBRARY_PATH=$base/htslib-1.10.2
+ls -lh $base/htslib-1.10.2/*.so
+
+nimble install -y https://github.com/brentp/d4-nim/
=====================================
scripts/plot-dist.py
=====================================
@@ -19,6 +19,9 @@ def main():
for chrom, data in it.groupby(gen, itemgetter(0)):
if chrom.startswith("GL"):
continue
+ if "Un" in chrom: continue
+ if "random" in chrom or "HLA" in chrom: continue
+ if chrom.endswith("alt"): continue
chroms[chrom] = True
xs, ys = [], []
v50 = 0
=====================================
tests/funcs.nim
=====================================
@@ -1,4 +1,5 @@
-import unittest, mosdepth as mosdepth
+import unittest
+import mosdepth
import os
suite "mosdepth-suite":
@@ -64,6 +65,14 @@ suite "mosdepth-suite":
check ts[2] == 3
check ts.len == 3
+ test "name splitting":
+ var r = region_line_to_region("Super-Scaffold_52")
+ check r.chrom == "Super-Scaffold_52"
+ check r.start == 0
+ check r.stop == 0
-
+ r = region_line_to_region("Super-Scaffold_52:2-1000")
+ check r.chrom == "Super-Scaffold_52"
+ check r.start == 1
+ check r.stop == 1000
View it on GitLab: https://salsa.debian.org/med-team/mosdepth/-/compare/dc8b42ddb9c296679a4e8b61be6ea85274348a20...8f0e04158b3e6f1105334ede4173cd8add3d8623
--
View it on GitLab: https://salsa.debian.org/med-team/mosdepth/-/compare/dc8b42ddb9c296679a4e8b61be6ea85274348a20...8f0e04158b3e6f1105334ede4173cd8add3d8623
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201102/8027eafa/attachment-0001.html>
More information about the debian-med-commit
mailing list